#include "KernelRepository.h"

const int KernelRepository::AuxiliaryStructures = 0;
const int KernelRepository::BinarySearch = 1;
const int KernelRepository::Finalization = 2;
const int KernelRepository::Finalization_long = 3;
const int KernelRepository::LogarithmUtils = 4;
const int KernelRepository::MemoryTransfers = 5;
const int KernelRepository::MultipleConsistencyUtils = 6;
const int KernelRepository::MultipleConsistency_old = 7;
const int KernelRepository::MultipleFinalization = 8;
const int KernelRepository::MultiplePartition = 9;
const int KernelRepository::MultipleProbabilistic = 10;
const int KernelRepository::MultipleSparse = 11;
const int KernelRepository::Partition = 12;
const int KernelRepository::PartitionLogarithm = 13;
const int KernelRepository::PartitionLogarithm_fast = 14;
const int KernelRepository::PartitionLogarithm_long = 15;
const int KernelRepository::PartitionLogarithm_long_fast = 16;
const int KernelRepository::Partition_long = 17;
const int KernelRepository::Probabilistic = 18;
const int KernelRepository::Probabilistic_long = 19;
const int KernelRepository::Random = 20;
const int KernelRepository::ScoreType = 21;
const int KernelRepository::SparseMatrix = 22;
const int KernelRepository::SparseMatrix_long = 23;
const int KernelRepository::Utils = 24;
char * KernelRepository::kernels[] = {
"#pragma once\n"
"\n"
"// some stuff related to global memory pattern\n"
"#define jagSize_float	32\n"
"#define jagSize_double	16\n"
"\n"
"#define JAG_FLOAT		32\n"
"#define JAG_DOUBLE		16\n"
"\n"
"\n"
"int jaggedSize(int width, int height, int jag)\n"
"{\n"
"return (height + jag - 1) * _ceildiv(width, jag) * jag;\n"
"}\n"
"\n"
"int jaggedIndex(int x, int y, int height, int jag)\n"
"{\n"
"int local_x = x % jag;\n"
"int local_offset = (local_x + y) * jag + local_x;\n"
"int global_offset = (x / jag) * (height + jag - 1) * jag;\n"
"\n"
"return global_offset + local_offset;\n"
"}\n"
"\n"
"#define jaggedIndex_float(x,y,height) jaggedIndex(x,y,height, jagSize_float)\n"
"\n"
"/*\n"
"int jaggedIndex_float(int x, int y, int height)\n"
"{\n"
"int local_x = x & 31;\n"
"int local_offset = (local_x + y) * 32 + local_x;\n"
"int global_offset = (x >> 5) * (height + 31) << 5;\n"
"\n"
"return global_offset + local_offset;\n"
"}\n"
"*/\n"
"\n"
"struct PosteriorSchedule\n"
"{\n"
"int taskCount;\n"
"};\n"
"\n"
"\n"
"struct RelaxationSchedule\n"
"{\n"
"int numSeqs;\n"
"int sparseWidth;\n"
"int sequenceBegin;\n"
"int sequenceEnd;\n"
"};\n"
"\n"
"struct PosteriorTask\n"
"{\n"
"unsigned int i;	// first sequence index\n"
"unsigned int j;	// second sequence index\n"
"unsigned int offset_Aij;	// auxiliary buffer offset\n"
"unsigned int offset_Pij;	// posterior Pij matrix offset\n"
"unsigned int numCells;	// size of sparse matrix\n"
"float pid;\n"
"};\n"
"\n"
"struct RelaxationTask\n"
"{\n"
"unsigned short i;	// first sequence index\n"
"unsigned short j;	// second sequence index\n"
"unsigned short seed;			// random number generator seed\n"
"unsigned short acceptedCount;	// number of sequences accepted by selectivity\n"
"};\n"
"\n"
,
"#pragma once\n"
"\n"
"\n"
"int linear_search()\n"
"{\n"
"int hi = -1;\n"
"for (int i = 0; i <= Sxy_size; ++i) {\n"
"hi = (Sxy_row[i].column == Szy_element.column) ? i : hi;\n"
"}\n"
"\n"
"if (hi >= 0) {\n"
"Sxy_row[hi].value += w * Sxz_elements[k].value * Szy_element.value;\n"
"}\n"
"}\n"
"\n"
"\n"
"int binary_search()\n"
"{\n"
"int lo = 0;\n"
"int hi = Sxy_size;\n"
"\n"
"while (lo < hi) {\n"
"int mid = hadd(lo, hi); // same as (lo + hi)>> 1;\n"
"if (Sxy_row[mid].column >= Szy_element.column) {\n"
"hi = mid;\n"
"} else {\n"
"lo = mid + 1;\n"
"}\n"
"}\n"
"\n"
"if (Sxy_row[hi].column == Szy_element.column) {\n"
"Sxy_row[hi].value += w * Sxz_elements[k].value * Szy_element.value;\n"
"}\n"
"}\n"
"\n"
"\n"
"int binary_search_store()\n"
"{\n"
"int lo = 0;\n"
"int hi = Sxy_size;\n"
"int midColumn = 0;\n"
"\n"
"while (lo < hi) {\n"
"int mid = hadd(lo, hi); // same as (lo + hi)>> 1;\n"
"midColumn = Sxy_row[mid].column;\n"
"if (midColumn >= Szy_element.column) {\n"
"hi = mid;\n"
"} else {\n"
"lo = mid + 1;\n"
"}\n"
"}\n"
"\n"
"if (midColumn == Szy_element.column) {\n"
"Sxy_row[hi].value += w * Sxz_elements[k].value * Szy_element.value;\n"
"}\n"
"}\n"
"\n"
"\n"
"int binary_search_unrolled()\n"
"{\n"
"unsigned int hi = Sxy_size;\n"
"unsigned int id = sub_sat(hi, 8u);\n"
"\n"
"while (Sxy_row[id].column > Szy_element.column)\n"
"id = sub_sat(id, 16u);\n"
"\n"
"id = Szy_element.column < Sxy_row[id].column ? sub_sat(id, 4u) : min(id + 4, hi);\n"
"id = Szy_element.column < Sxy_row[id].column ? sub_sat(id, 2u) : min(id + 2, hi);\n"
"id = Szy_element.column < Sxy_row[id].column ? sub_sat(id, 1u) : min(id + 1, hi);\n"
"id = Szy_element.column < Sxy_row[id].column ? sub_sat(id, 1u) : min(id + 0, hi);\n"
"\n"
"if (Sxy_row[id].column == Szy_element.column) {\n"
"Sxy_row[id].value += w * Sxz_elements[k].value * Szy_element.value;\n"
"break;\n"
"}\n"
"}\n"
"\n"
"\n"
"void binary_array() {\n"
"\n"
"// set starting position in binary search\n"
"int initial_id = (1 << (32 - clz(Sxy_size))) >> 1; // round size up to the nearest power of 2\n"
"int initial_step = initial_id;\n"
"\n"
"// corrections\n"
"initial_id -= (initial_id > 0) ? 1 : 0;\n"
"\n"
"--Sxy_size;\n"
"\n"
"int id = initial_id;\n"
"int stp = initial_step;\n"
"int col = -1;\n"
"\n"
"while (stp) {\n"
"stp >>= 1; // divide step by 2\n"
"col = Sxy_row[min(id, Sxy_size)].column;\n"
"stp = (Szy_elem.column == col) ? 0 : stp;\n"
"id += (Szy_elem.column < col) ? -stp : stp;\n"
"}\n"
"\n"
"if (col == Szy_elem.column) {\n"
"Sxy_row[min(id, Sxy_size)].value += w * Sxz_elements[k].value * Szy_elem.value;\n"
"}\n"
"\n"
"}\n"
,
"__kernel\n"
"void Finalization_CombineMatrices(\n"
"global	const	float* matrix1,\n"
"global	const	float* matrix2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"float pid,\n"
"local			float* localBuffer,\n"
"global			float* out,\n"
"global			float* extra)\n"
"{\n"
"#define j get_local_id(0)\n"
"int columnSize = 0; // number of significant cells\n"
"\n"
"local float* scoringRow = localBuffer;\n"
"local int* columnSizes = (local int*)localBuffer;\n"
"\n"
"//	local int* rowSizes = columnSizes + seq2Length + 1;\n"
"// seq1 can be longer than the workgroup size\n"
"//	for (int q = j; q <= seq1Length; q += get_local_size(0)) {\n"
"//		rowSizes[q] = 0;\n"
"//	}\n"
"\n"
"// column offset in global memory\n"
"int ij = jaggedIndex_float(j, -j, seq1Length + 1);			// seek to -j'th row\n"
"\n"
"float diagScore = 0;\n"
"float score = 0;\n"
"\n"
"for (int iter = 0, iterationsCount = seq1Length + seq2Length + 1; iter < iterationsCount; ++iter)\n"
"{\n"
"int i = iter - j;\n"
"float leftScore = (j == 0 || j > seq2Length) ? 0 : scoringRow[j-1]; // calculate scoring matrix cells\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) // if i and j are legal\n"
"{\n"
"float a = 0;\n"
"\n"
"if (i > 0 && j > 0) {\n"
"a = matrix1[ij];\n"
"float b = matrix2[ij];\n"
"a = sqrt(HMM_WEIGHT * a * a + (1 - HMM_WEIGHT) * b * b);					// store posterior cell in a\n"
"\n"
"score = _max3(a + diagScore, leftScore, score);		// store current score\n"
"columnSize += (a >= POSTERIOR_CUTOFF) ? 1 : 0;\n"
"}\n"
"\n"
"out[ij] = a;\n"
"scoringRow[j] = score;\n"
"diagScore = leftScore;\n"
"}\n"
"\n"
"// move to the next element\n"
"ij += jagSize_float;\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"}\n"
"\n"
"// store number of non zero cells in columns and rows\n"
"if (j <= seq2Length)	{ columnSizes[j] = columnSize; }\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// last thread sums up column sizes and saves stuff in extra output\n"
"if (j == seq2Length) {\n"
"for (int k = 1; k < seq2Length; ++k) { columnSize += columnSizes[k]; }\n"
"\n"
"localBuffer[0] = score;\n"
"localBuffer[1] = columnSize;\n"
"\n"
"#ifdef RETURN_GLOBAL\n"
"if (extra != 0) {\n"
"extra[0] = score;\n"
"extra[1] = columnSize;\n"
"}\n"
"#endif\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"__kernel\n"
"void Finalization_ComputeAll(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"float pid,\n"
"struct PosteriorSchedule scheduling,\n"
"HMM_MEM		const	struct ProbabilisticParams* probParams,\n"
"PART_MEM	const	struct PartitionFunctionParams* funcParams,\n"
"local				partition_t* localBuffer,\n"
"global				partition_t* auxiliaryThreeLayers,\n"
"global				float* posterior)\n"
"{\n"
"#define j get_local_id(0)\n"
"\n"
"global float* probPosterior = auxiliaryThreeLayers;\n"
"global float* funcPosterior = posterior;\n"
"\n"
"Partition_ComputeAll(seq1, seq2, seq1Length, seq2Length, scheduling, funcParams,\n"
"localBuffer, auxiliaryThreeLayers, funcPosterior);\n"
"\n"
"Probabilistic_ComputeAll(seq1, seq2, seq1Length, seq2Length, scheduling, probParams,\n"
"(local float*)localBuffer, probPosterior, probPosterior);\n"
"\n"
"#ifdef RETURN_GLOBAL\n"
"if (j == 0) {\n"
"probPosterior[0] = 0;\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"#endif\n"
"\n"
"// calculate combined posterior matrix\n"
"Finalization_CombineMatrices(\n"
"probPosterior,\n"
"funcPosterior,\n"
"seq1Length,\n"
"seq2Length,\n"
"pid,\n"
"(local float*)localBuffer,\n"
"posterior,\n"
"probPosterior); // extra output will be placed at probPosterior\n"
"\n"
"// calculate distance on the basis of score\n"
"if (j == 0) {\n"
"posterior[0] = 1.0f - ((local float*)localBuffer)[0] / min(seq1Length, seq2Length);\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"#undef j\n"
"}\n"
,
"__kernel\n"
"void Finalization_CombineMatrices_long(\n"
"global	const	float* matrix1,\n"
"global	const	float* matrix2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"float pid,\n"
"local			float* localBuffer,\n"
"global			float* out,\n"
"global			float* extra,\n"
"global			float* scoringColumn)\n"
"{\n"
"#define j_offset get_local_id(0)\n"
"int columnSize = 0; // number of significant cells\n"
"\n"
"local float* scoringRow = localBuffer;\n"
"local int* columnSizes = (local int*)localBuffer;\n"
"\n"
"// reset scoring column\n"
"for (int i = get_local_id(0); i <= seq1Length; i += get_local_size(0)) {\n"
"scoringColumn[i] = 0;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"float score = 0;\n"
"\n"
"int j = j_offset;\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"float diagScore = 0;\n"
"score = 0;\n"
"\n"
"// column offset in global memory\n"
"int ij = jaggedIndex_float(j, -j_offset, seq1Length + 1);			// seek to -j'th row\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = iter - j_offset;\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) {\n"
"// if i and j are legal\n"
"float leftScore = (j_offset == 0) ? scoringColumn[i] : scoringRow[j_offset - 1];\n"
"\n"
"float a = 0;\n"
"\n"
"if (i > 0 && j > 0) {\n"
"a = matrix1[ij];\n"
"float b = matrix2[ij];\n"
"a = sqrt(HMM_WEIGHT * a * a + (1 - HMM_WEIGHT) * b * b); // store posterior cell in a\n"
"\n"
"score = _max3(a + diagScore, leftScore, score);		// store current score\n"
"columnSize += (a >= POSTERIOR_CUTOFF) ? 1 : 0;\n"
"}\n"
"\n"
"out[ij] = a;\n"
"scoringRow[j_offset] = score;\n"
"diagScore = leftScore;\n"
"\n"
"// last thread in a group stores column\n"
"if (j_offset == get_local_size(0) - 1) {\n"
"scoringColumn[i] = score;\n"
"}\n"
"}\n"
"\n"
"// move to the next element\n"
"ij += jagSize_float;\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"}\n"
"}\n"
"\n"
"// store number of non zero cells in columns and rows\n"
"columnSizes[get_local_id(0)] = columnSize;\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// last saves stuff in extra output\n"
"if (j - get_local_size(0) == seq2Length) {\n"
"\n"
"columnSize = 0;\n"
"for (int k = 0; k < get_local_size(0); ++k) {\n"
"columnSize += columnSizes[k];\n"
"}\n"
"\n"
"localBuffer[0] = score;\n"
"localBuffer[1] = columnSize;\n"
"\n"
"#ifdef RETURN_GLOBAL\n"
"if (extra != 0) {\n"
"extra[0] = score;\n"
"extra[1] = columnSize;\n"
"}\n"
"#endif\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"__kernel\n"
"void Finalization_ComputeAll_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"float pid,\n"
"struct PosteriorSchedule scheduling,\n"
"HMM_MEM		const	struct ProbabilisticParams* probParams,\n"
"PART_MEM	const	struct PartitionFunctionParams* funcParams,\n"
"local				partition_t* localBuffer,\n"
"global				partition_t* auxiliaryThreeLayers,\n"
"global				float* posterior,\n"
"global				float* columns)\n"
"{\n"
"#define j get_local_id(0)\n"
"\n"
"global float* probPosterior = auxiliaryThreeLayers;\n"
"global float* funcPosterior = posterior;\n"
"\n"
"Partition_ComputeAll_long(seq1, seq2, seq1Length, seq2Length, scheduling, funcParams,\n"
"localBuffer, auxiliaryThreeLayers, funcPosterior, columns);\n"
"\n"
"Probabilistic_ComputeAll_long(seq1, seq2, seq1Length, seq2Length, scheduling, probParams,\n"
"(local float*)localBuffer, probPosterior, probPosterior, columns);\n"
"\n"
"#ifdef RETURN_GLOBAL\n"
"if (j == 0) {\n"
"probPosterior[0] = 0;\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"#endif\n"
"\n"
"// calculate combined posterior matrix\n"
"Finalization_CombineMatrices_long(\n"
"probPosterior,\n"
"funcPosterior,\n"
"seq1Length,\n"
"seq2Length,\n"
"pid,\n"
"(local float*)localBuffer,\n"
"posterior,\n"
"probPosterior, // extra output will be placed at probPosterior\n"
"columns);\n"
"\n"
"// calculate distance on the basis of score\n"
"if (j == 0) {\n"
"posterior[0] = 1.0f - ((local float*)localBuffer)[0] / min(seq1Length, seq2Length);\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"#undef j\n"
"}\n"
"\n"
,
"__constant const float LOG_ZERO = -2e20f;\n"
"__constant const float LOG_ONE = 0.0f;\n"
"__constant const float LOG_UNDERFLOW_THRESHOLD = LOG_UNDERFLOW;\n"
"\n"
"#ifdef USE_TAYLOR_HMM\n"
"#define fast_exp(x)		array_exp(x)\n"
"#define log_exp_1(x)	array_log_exp_1(x)\n"
"#else\n"
"#define fast_exp(x)		native_exp(x)\n"
"#define log_exp_1(x)	native_log(native_exp(x) + 1)\n"
"#endif\n"
"\n"
"\n"
"#ifdef USE_TAYLOR_HMM\n"
"\n"
"typedef double4 exp_vector_type;\n"
"typedef double exp_scalar_type;\n"
"\n"
"__constant const exp_vector_type exp_05_0	= (exp_vector_type)(0.03254409303190190000, 0.16280432765779600000, 0.49929760485974900000, 0.99995149601363700000);\n"
"__constant const exp_vector_type exp_1_05	= (exp_vector_type)(0.01973899026052090000, 0.13822379685007000000, 0.48056651562365000000, 0.99326940370383500000);\n"
"__constant const exp_vector_type exp_2_1	= (exp_vector_type)(0.00940528203591384000, 0.09414963667859410000, 0.40825793595877300000, 0.93933625499130400000);\n"
"__constant const exp_vector_type exp_4_2	= (exp_vector_type)(0.00217245711583303000, 0.03484829428350620000, 0.22118199801337800000, 0.67049462206469500000);\n"
"__constant const exp_vector_type exp_8_4	= (exp_vector_type)(0.00012398771025456900, 0.00349155785951272000, 0.03727721426017900000, 0.17974997741536900000);\n"
"__constant const exp_vector_type exp_16_8	= (exp_vector_type)(0.00000051741713416603, 0.00002721456879608080, 0.00053418601865636800, 0.00464101989351936000);\n"
"__constant exp_scalar_type free_05_0	= 0.99999925508501600000;\n"
"__constant exp_scalar_type free_1_05	= 0.99906756856399500000;\n"
"__constant exp_scalar_type free_2_1		= 0.98369508190545300000;\n"
"__constant exp_scalar_type free_4_2		= 0.83556950223398500000;\n"
"__constant exp_scalar_type free_8_4		= 0.33249299994217400000;\n"
"__constant exp_scalar_type free_16_8	= 0.01507447981459420000;\n"
"\n"
"float array_exp(float x)\n"
"{\n"
"if (x > 0) return native_exp(x);\n"
"if (x <= -16) return 0;\n"
"\n"
"exp_vector_type coeffs = (x > -2)\n"
"? ( (x > -0.5) ? exp_05_0 : ( (x > -1) ? exp_1_05 : exp_2_1) )\n"
": ( (x > -8) ? ( (x > -4) ? exp_4_2 : exp_8_4 ) : exp_16_8 );\n"
"\n"
"exp_scalar_type free = (x > -2)\n"
"? ( (x > -0.5) ? free_05_0 : ( (x > -1) ? free_1_05 : free_2_1) )\n"
": ( (x > -8) ? ( (x > -4) ? free_4_2 : free_8_4 ) : free_16_8 );\n"
"\n"
"return (((coeffs.s0 * x + coeffs.s1) * x + coeffs.s2) * x + coeffs.s3) * x + free;\n"
"}\n"
"\n"
"__constant const float4 less_1 = (float4)(-0.009350833524763f, 0.130659527668286f, 0.498799810682272f, 0.693203116424741f);\n"
"__constant const float4 less_2 = (float4)(-0.014532321752540f, 0.139942324101744f, 0.495635523139337f, 0.692140569840976f);\n"
"__constant const float4 less_4 = (float4)(-0.004605031767994f, 0.063427417320019f, 0.695956496475118f, 0.514272634594009f);\n"
"__constant const float4 more_4 = (float4)(-0.000458661602210f, 0.009695946122598f, 0.930734667215156f, 0.168037164329057f);\n"
"\n"
"float array_log_exp_1(float x)\n"
"{\n"
"float4 coeffs = (float4) (\n"
"((x) <= 2.50f)\n"
"? (((x) <= 1.00f) ? less_1 : less_2)\n"
": (((x) <= 4.50f) ? less_4 : more_4) );\n"
"\n"
"return mad(mad(mad(coeffs.s0, x, coeffs.s1), x, coeffs.s2), x, coeffs.s3);\n"
"}\n"
"#endif\n"
"\n"
"\n"
"void logOfSum_local(local float *x, float y){\n"
"float d = *x;\n"
"float s;\n"
"if (d < y) { s = d; }\n"
"else { s = y; y = d; }\n"
"d = y - s;\n"
"*x = (s <= LOG_ZERO || d >= LOG_UNDERFLOW_THRESHOLD) ? y : log_exp_1(d) + s;\n"
"}\n"
"\n"
"void logOfSum_private(private float *x, float y){\n"
"#define d *x\n"
"float s;\n"
"if (d < y) { s = d; }\n"
"else { s = y; y = d; }\n"
"d = y - s;\n"
"*x = (s <= LOG_ZERO || d >= LOG_UNDERFLOW_THRESHOLD) ? y : log_exp_1(d) + s;\n"
"#undef d\n"
"}\n"
"\n"
"float logOfSum(float x, float y)\n"
"{\n"
"#define d x\n"
"float s;\n"
"if (d < y) { s = d; }\n"
"else { s = y; y = d; }\n"
"d = y - s;\n"
"return (s <= LOG_ZERO || d >= LOG_UNDERFLOW_THRESHOLD) ? y : log_exp_1(d) + s;\n"
"#undef d\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"\n"
,
"// This file contains some memory transfers definitions\n"
"#pragma once\n"
"\n"
"#define swap(x,y,temp) {temp = (x); x = (y); y = (temp);}\n"
"\n"
"#define __memcpy(dst,src,num,offset) { for (int i = offset; i < offset + num; i++) { dst[i] = src[i]; } }\n"
"\n"
"#define __memset(ptr,value,num) { for (int i = 0; i < num; i++) { (ptr)[i] = (value); } }\n"
"\n"
"void memcpy(local float* dst, global float* src, int num)\n"
"{\n"
"__memcpy(dst, src, num, 0);\n"
"}\n"
"\n"
"void group_multiply(global float* ptr, int size, float value)\n"
"{\n"
"for (int i = get_local_id(0); i < size; i += get_local_size(0)) { ptr[i] *= value; }\n"
"}\n"
"\n"
"void group_copy_g2g(global float* dst, global float* src, int size)\n"
"{\n"
"for (int i = get_local_id(0); i < size; i += get_local_size(0)) { dst[i] = src[i]; }\n"
"}\n"
"\n"
"void group_copy_g2l(local float* dst, global float* src, int size)\n"
"{\n"
"for (int i = get_local_id(0); i < size; i += get_local_size(0)) { dst[i] = src[i]; }\n"
"}\n"
"\n"
"void group_copy_l2g(global float* dst, local float* src, int size)\n"
"{\n"
"for (int i = get_local_id(0); i < size; i += get_local_size(0)) { dst[i] = src[i]; }\n"
"}\n"
"\n"
"void group_weightcopy_g2l(local float* dst, global float* src, float weight, int size)\n"
"{\n"
"for (int i = get_local_id(0); i < size; i += get_local_size(0)) { dst[i] = src[i] * weight; }\n"
"}\n"
"\n"
"void group_weightcopy_l2g(global float* dst, local float* src, float weight, int size)\n"
"{\n"
"for (int i = get_local_id(0); i < size; i += get_local_size(0)) { dst[i] = src[i] * weight; }\n"
"}\n"
"\n"
"void group_memset(global float* dst, int size, float value)\n"
"{\n"
"for (int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"{\n"
"dst[i] = value;\n"
"}\n"
"}\n"
"\n"
"\n"
,
"\n"
"\n"
"__kernel __attribute__((reqd_work_group_size(STRIPE_COUNT * STRIPE_LENGTH, 1, 1)))\n"
"void MultipleConsistency_Postprocess(\n"
"global		const	unsigned int* lengths,\n"
"global		const	unsigned int* sparseOffsets,\n"
"global				struct RelaxationTask* tasks,\n"
"global				float* Sxy,								// output sparse matrix to be filtered\n"
"const	float posteriorCutoff,\n"
"struct RelaxationSchedule schedule,\n"
"local				struct cell_type* Sxy_row)\n"
"{\n"
"#define localId get_local_id(0)\n"
"#define taskId get_group_id(0)\n"
"#define stripeId (get_local_id(0) / STRIPE_LENGTH)\n"
"\n"
"int stripeOffset = get_local_id(0) % STRIPE_LENGTH;\n"
"\n"
"local index_type Sxy_sizes[STRIPE_COUNT];\n"
"local int x, y;\n"
"local int length_x_inc;\n"
"\n"
"if (localId == 0) {\n"
"x = tasks[taskId].i;\n"
"y = tasks[taskId].j;\n"
"length_x_inc = lengths[x] + 1;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// copy indices to auxiliary array\n"
"Sxy += sparseOffsets[x * schedule.numSeqs + y];	// output sparse matrix\n"
"\n"
"float sumW = *Sxy;\n"
"\n"
"int end_ix = _ceilround(length_x_inc - 1, STRIPE_COUNT) + 1;\n"
"\n"
"// filter rows and update sizes\n"
"for (int ix = 1 + stripeId; ix < end_ix; ix += STRIPE_COUNT) {\n"
"\n"
"if (stripeOffset == 0) {\n"
"Sxy_sizes[stripeId] = ix < length_x_inc ? sparse_rowSizes(Sxy)[ix] : 0;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"int Sxy_size = Sxy_sizes[stripeId];\n"
"\n"
"// store locally Sxy rows being processed and weight them\n"
"for (int y_lane = stripeOffset; y_lane < Sxy_size; y_lane += STRIPE_LENGTH) {\n"
"sparse_cell(Sxy, length_x_inc, ix, y_lane)->value /= sumW;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"#undef localId\n"
"#undef taskId\n"
"#undef stripeId\n"
"}\n"
"\n"
"\n"
"__kernel __attribute__((reqd_work_group_size(STRIPE_COUNT * STRIPE_LENGTH, 1, 1)))\n"
"void MultipleConsistency_PostprocessFiltered(\n"
"global		const	unsigned int* lengths,\n"
"global		const	unsigned int* sparseOffsets,\n"
"global				struct RelaxationTask* tasks,\n"
"global				float* Sxy,								// output sparse matrix to be filtered\n"
"const	float posteriorCutoff,\n"
"struct RelaxationSchedule schedule,\n"
"local				struct cell_type* Sxy_row)\n"
"{\n"
"#define localId get_local_id(0)\n"
"#define taskId get_group_id(0)\n"
"#define stripeId (get_local_id(0) / STRIPE_LENGTH)\n"
"\n"
"int stripeOffset = get_local_id(0) % STRIPE_LENGTH;\n"
"\n"
"local index_type Sxy_sizes[STRIPE_COUNT];\n"
"local int x, y;\n"
"local int length_x_inc;\n"
"local int totalCells;\n"
"local int stripeRowIndex;\n"
"int maxRow = 0;\n"
"\n"
"\n"
"if (localId == 0) {\n"
"x = tasks[taskId].i;\n"
"y = tasks[taskId].j;\n"
"length_x_inc = lengths[x] + 1;\n"
"totalCells = 0;\n"
"stripeRowIndex = 0;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// copy indices to auxiliary array\n"
"Sxy += sparseOffsets[x * schedule.numSeqs + y];	// output sparse matrix\n"
"float sumW = *Sxy;\n"
"\n"
"Sxy_row += stripeId * schedule.sparseWidth;\n"
"\n"
"int end_ix = _ceilround(length_x_inc - 1, STRIPE_COUNT) + 1;\n"
"\n"
"// filter rows and update sizes\n"
"for (int ix = 1 + stripeId; ix < end_ix; ix += STRIPE_COUNT) {\n"
"\n"
"if (stripeOffset == 0) {\n"
"Sxy_sizes[stripeId] = ix < length_x_inc ? sparse_rowSizes(Sxy)[ix] : 0;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"int Sxy_size = Sxy_sizes[stripeId];\n"
"\n"
"// store locally Sxy rows being processed and weight them\n"
"for (int y_lane = stripeOffset; y_lane < Sxy_size; y_lane += STRIPE_LENGTH) {\n"
"Sxy_row[y_lane] = sparse_data(Sxy, length_x_inc)[\n"
"(stripeId == 0 ? stripeRowIndex : sparse_rowIndices(Sxy, length_x_inc)[ix]) + y_lane];\n"
"// use source index to get row as destination indices are altered\n"
"Sxy_row[y_lane].value = Sxy_row[y_lane].value / sumW;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// filter output rows\n"
"if (stripeOffset == 0 && ix < length_x_inc) {\n"
"int accepted = 0;\n"
"for (int c = 0; c < Sxy_size; ++c) {\n"
"if (Sxy_row[c].value >= posteriorCutoff) {\n"
"Sxy_row[accepted++] = Sxy_row[c];\n"
"}\n"
"}\n"
"\n"
"sparse_rowSizes(Sxy)[ix] = Sxy_sizes[stripeId] = accepted;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"Sxy_size = Sxy_sizes[stripeId];\n"
"\n"
"// update offsets of Sxy rows being processed\n"
"if (localId == 0) {\n"
"int idx = sparse_rowIndices(Sxy, length_x_inc)[ix];\n"
"stripeRowIndex = (ix + STRIPE_COUNT < length_x_inc) ? sparse_rowIndices(Sxy, length_x_inc)[ix + STRIPE_COUNT] : 0;\n"
"\n"
"for (int j = 0, row = ix; j < STRIPE_COUNT && row < length_x_inc; ++j, ++row) {\n"
"idx += Sxy_sizes[j];\n"
"maxRow = max(maxRow, Sxy_sizes[j]);\n"
"\n"
"if (row < length_x_inc - 1) {\n"
"sparse_rowIndices(Sxy, length_x_inc)[row + 1] = idx; // update index of next row\n"
"}\n"
"}\n"
"totalCells = idx;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"// store output Sxy rows\n"
"for (int y_lane = stripeOffset; y_lane < Sxy_size; y_lane += STRIPE_LENGTH) {\n"
"*sparse_cell(Sxy, length_x_inc, ix, y_lane) = Sxy_row[y_lane];\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"// store new number of cells\n"
"if (localId == 0) {\n"
"*sparse_numCells(Sxy) = totalCells;\n"
"sparse_rowSizes(Sxy)[0] = maxRow;\n"
"sparse_rowIndices(Sxy, length_x_inc)[0] = totalCells;\n"
"}\n"
"\n"
"#undef localId\n"
"#undef taskId\n"
"#undef stripeId\n"
"}\n"
"\n"
,
"#pragma once\n"
"\n"
"#if defined SELECTIVITY_SUM\n"
"#define SELECTIVITY_FUN(x,y) ((x)+(y))\n"
"#elif defined SELECTIVITY_MAX\n"
"#define SELECTIVITY_FUN(x,y) max((x),(y))\n"
"#elif defined SELECTIVITY_MIN\n"
"#define SELECTIVITY_FUN(x,y) min((x),(y))\n"
"#elif defined SELECTIVITY_AVG\n"
"#define SELECTIVITY_FUN(x,y) (((x)+(y))/2)\n"
"#endif\n"
"\n"
"#if defined FILTER_VERTICAL\n"
"#define filter(a,b,x)			(((x) <= (a)) ? 1.0f : 0)\n"
"#elif defined FILTER_LINEAR\n"
"#define filter(a,b,x)			((a) * (x) + (b))\n"
"#elif defined FILTER_HOMOGRAPHIC\n"
"#define filter(a,b,x)			((1 - (x)) / ((a) * (x) + 1))\n"
"#elif defined FILTER_TRIANGLE_MIDPASS\n"
"#define filter(a,b,x)			min((a)*(x), -(a)*(x)+(a))\n"
"#endif\n"
"\n"
"__kernel __attribute__((reqd_work_group_size(STRIPE_COUNT * STRIPE_LENGTH, 1, 1)))\n"
"void MultipleConsistency_RelaxOld(\n"
"global	const	unsigned int* lengths,\n"
"global	const	unsigned int* sortingMap,\n"
"global	const	float* seqsWeights,\n"
"global	const	float* distances,\n"
"global	const	unsigned int* sparseOffsets,\n"
"global			struct RelaxationTask* tasks,\n"
"global			float* Sxy,						// output sparse matrix\n"
"global	const	float* Sxz,\n"
"global	const	float* Szy,\n"
"struct RelaxationSchedule schedule,\n"
"local			struct cell_type* Sxz_elements)\n"
"{\n"
"// make some aliases\n"
"#define localId get_local_id(0)\n"
"#define taskId get_group_id(0)\n"
"#define stripeId (get_local_id(0) >> STRIPE_LENGTH_LOG2)\n"
"\n"
"int stripeOffset = get_local_id(0) % STRIPE_LENGTH;\n"
"// store stripeOffset and sparseWidth in the same variable\n"
"\n"
"local index_type auxSizes[STRIPE_COUNT];\n"
"local int x, y;\n"
"local float wx_wy;\n"
"local int length_x_inc;\n"
"local int aux1;\n"
"local int aux2;\n"
"local float sumW;\n"
"\n"
"// distribution parameters\n"
"local float a;\n"
"local float b;\n"
"\n"
"if (localId == 0) {\n"
"x = tasks[taskId].i;\n"
"y = tasks[taskId].j;\n"
"length_x_inc = lengths[x] + 1;\n"
"aux1 = sparseOffsets[x * schedule.numSeqs + y];\n"
"sumW = *(Sxy + aux1);\n"
"\n"
"a = distances[0 * schedule.numSeqs + 0];\n"
"b = distances[1 * schedule.numSeqs + 1];\n"
"\n"
"// calculate adjusted selfweight sw = 1 + (sw - 1) * acceptedFraction\n"
"wx_wy = 1.0f + (distances[schedule.numSeqs * schedule.numSeqs - 1] - 1.0f) * (float)tasks[taskId].acceptedCount / a;\n"
"wx_wy *= seqsWeights[x] + seqsWeights[y];\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"Sxy += aux1;	// output sparse matrix\n"
"\n"
"// initialize local pointers and make some aliases\n"
"local struct cell_type* Sxy_row = Sxz_elements + get_local_size(0);\n"
"\n"
"// add taks-specific offsets\n"
"Sxz_elements += stripeId * STRIPE_LENGTH;\n"
"Sxy_row += stripeId * schedule.sparseWidth;\n"
"\n"
"int end_ix = _ceilround(length_x_inc - 1, STRIPE_COUNT) + 1;\n"
"int seed = tasks[taskId].seed;\n"
"\n"
"// contribution from all sequences other than x and y (uz - unsorted z index)\n"
"for (int uz = schedule.sequenceBegin; uz < schedule.sequenceEnd; ++uz) {\n"
"\n"
"int z = sortingMap[uz];\n"
"float w = SELECTIVITY_FUN(distances[x * schedule.numSeqs + z], distances[y * schedule.numSeqs + z]);\n"
"\n"
"seed = parkmiller(seed); // get next random number\n"
"w = filter(a, b, w);\n"
"w = ((float)seed) * RND_MAX_INV - w;\n"
"\n"
"if (z == x || z == y || w >= 0) {\n"
"continue;\n"
"}\n"
"\n"
"w = seqsWeights[z] / (wx_wy);\n"
"\n"
"if (localId == 0) {\n"
"aux1 = sparseOffsets[x * schedule.numSeqs + z];\n"
"aux2 = sparseOffsets[z * schedule.numSeqs + y];\n"
"sumW += w;\n"
"}\n"
"\n"
"int length_z_inc = lengths[z] + 1;\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"Sxz += aux1;\n"
"Szy += aux2;\n"
"\n"
"// iterate over Sxy rows\n"
"for (int ix = 1 + stripeId; ix < end_ix; ix += STRIPE_COUNT) {\n"
"\n"
"// store Sxz sizes in auxiliary local array\n"
"if (stripeOffset == 0) {\n"
"auxSizes[stripeId] = ix < length_x_inc ? sparse_rowSizes(Sxz)[ix] : 0;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"int Sxy_size = ix < length_x_inc ? sparse_rowSizes(Sxy)[ix] : 0;\n"
"int rowIndex = ix < length_x_inc ? sparse_rowIndices(Sxz, length_x_inc)[ix] : 0;\n"
"int Sxz_size = auxSizes[stripeId];\n"
"\n"
"// get maximum Sxz length\n"
"if (localId == 0) {\n"
"index_type currentMax = 0;\n"
"#pragma unroll STRIPE_COUNT\n"
"for (int j = 0; j < STRIPE_COUNT; ++j) {\n"
"currentMax = max(auxSizes[j], currentMax);\n"
"}\n"
"auxSizes[0] = currentMax;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// store locally Sxy rows being processed\n"
"for (int y_lane = stripeOffset; y_lane < Sxy_size; y_lane += STRIPE_LENGTH) {\n"
"Sxy_row[y_lane] = *sparse_cell(Sxy, length_x_inc, ix, y_lane);\n"
"}\n"
"\n"
"--Sxy_size;\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// process consecutive stripes of Sxz\n"
"for (int z_lane = stripeOffset, end = _ceilround(auxSizes[0], STRIPE_LENGTH); z_lane < end; z_lane += STRIPE_LENGTH) {\n"
"\n"
"// store Sxz stripes locally\n"
"if (z_lane < Sxz_size) {\n"
"Sxz_elements[stripeOffset] = sparse_data(Sxz, length_x_inc)[rowIndex + z_lane];\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// perform multiplication\n"
"if (Sxy_size >= 0)\n"
"{\n"
"int end_k = min(STRIPE_LENGTH, Sxz_size - z_lane + stripeOffset); // start from last index\n"
"for (int k = 0; k < end_k ; ++k) {\n"
"\n"
"int Szy_size = sparse_rowSizes(Szy)[Sxz_elements[k].column];\n"
"for (int y_lane = stripeOffset; y_lane < Szy_size; y_lane += STRIPE_LENGTH) {\n"
"\n"
"private struct cell_type Szy_elem = *sparse_cell(Szy, length_z_inc, Sxz_elements[k].column, y_lane);\n"
"int lo = 0;\n"
"int hi = Sxy_size;\n"
"\n"
"while (lo < hi) {\n"
"int mid = hadd(lo, hi); // same as (lo + hi)>> 1;\n"
"if (Sxy_row[mid].column >= Szy_elem.column) {\n"
"hi = mid;\n"
"} else {\n"
"lo = mid + 1;\n"
"}\n"
"}\n"
"\n"
"if (Sxy_row[hi].column == Szy_elem.column) {\n"
"Sxy_row[hi].value += w * Sxz_elements[k].value * Szy_elem.value;\n"
"}\n"
"}\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"} // end stripe loop\n"
"\n"
"++Sxy_size;\n"
"// copy Sxy rows back to global\n"
"for (int y_lane = stripeOffset; y_lane < Sxy_size; y_lane += STRIPE_LENGTH) {\n"
"sparse_cell(Sxy, length_x_inc, ix, y_lane)->value = Sxy_row[y_lane].value;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"} // end row iteration\n"
"\n"
"Sxz -= aux1;\n"
"Szy -= aux2;\n"
"} // end relaxing sequence iteration\n"
"\n"
"if (localId == 0) {\n"
"*Sxy = sumW;\n"
"tasks[taskId].seed = seed;\n"
"}\n"
"\n"
"#undef localId\n"
"#undef taskId\n"
"#undef stripeId\n"
"}\n"
"\n"
,
"#pragma once\n"
"\n"
"#ifndef LONG_KERNELS\n"
"\n"
"__kernel void MultipleFinalization_Combine(\n"
"global		const	unsigned int*	lengths,\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				float*			localBuffer)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"\n"
"global float* posterior = posteriorMatrices + tasks[taskId].offset_Pij;\n"
"\n"
"// calculate combined posterior matrix\n"
"Finalization_CombineMatrices(\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij,\n"
"posteriorMatrices + tasks[taskId].offset_Pij,\n"
"lengths[task_i],\n"
"lengths[task_j],\n"
"tasks[taskId].pid,\n"
"localBuffer,\n"
"posterior,\n"
"(global float*)0);\n"
"\n"
"// calculate distance on the basis of score\n"
"if (get_local_id(0) == 0) {\n"
"posterior[0] = 1.0f - localBuffer[0] / min(lengths[task_i], lengths[task_j]);\n"
"tasks[taskId].numCells = localBuffer[1];\n"
"}\n"
"}\n"
"\n"
"#else\n"
"\n"
"__kernel void MultipleFinalization_Combine_long(\n"
"global		const	unsigned int*	lengths,\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				float*			localBuffer,\n"
"global				float*			columns,\n"
"const	int				columnsOffset)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"\n"
"global float* posterior = posteriorMatrices + tasks[taskId].offset_Pij;\n"
"\n"
"// calculate combined posterior matrix\n"
"Finalization_CombineMatrices_long(\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij,\n"
"posteriorMatrices + tasks[taskId].offset_Pij,\n"
"lengths[task_i],\n"
"lengths[task_j],\n"
"tasks[taskId].pid,\n"
"localBuffer,\n"
"posterior,\n"
"(global float*)0,\n"
"columns + taskId * columnsOffset);\n"
"\n"
"// calculate distance on the basis of score\n"
"if (get_local_id(0) == 0) {\n"
"posterior[0] = 1.0f - localBuffer[0] / min(lengths[task_i], lengths[task_j]);\n"
"tasks[taskId].numCells = localBuffer[1];\n"
"}\n"
"}\n"
"\n"
"#endif\n"
,
"#pragma once\n"
"\n"
"#ifndef LONG_KERNELS\n"
"\n"
"__kernel void MultiplePartition_ComputeForward(\n"
"global		const	char*			sequences,\n"
"global		const	unsigned int*	offsets,\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			PartitionFunctionParams* globalParams MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				partition_t*	localMemory)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"\n"
"#ifdef COPY_PARTITION_PARAMS\n"
"// copy scoring matrices from global to local memory\n"
"local struct PartitionFunctionParams localParams;\n"
"local struct PartitionFunctionParams *params = &localParams;\n"
"\n"
"event_t eventCopied = async_work_group_copy(\n"
"(local int*)params, (global int*)globalParams, sizeof(struct PartitionFunctionParams) / sizeof(int), 0);\n"
"wait_group_events(1, &eventCopied);\n"
"#else\n"
"#define params globalParams\n"
"#endif\n"
"\n"
"Partition_ComputeForward(\n"
"sequences + offsets[task_i],\n"
"sequences + offsets[task_j],\n"
"lengths[task_i],\n"
"lengths[task_j],\n"
"scheduling,\n"
"params,\n"
"localMemory,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij);\n"
"}\n"
"\n"
"\n"
"__kernel void MultiplePartition_ComputeReverse(\n"
"global		const	char*			sequences,\n"
"global		const	unsigned int*	offsets,\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			PartitionFunctionParams* globalParams MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				partition_t*	localMemory)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"\n"
"#ifdef COPY_PARTITION_PARAMS\n"
"// copy scoring matrices from global to local memory\n"
"local struct PartitionFunctionParams localParams;\n"
"local struct PartitionFunctionParams *params = &localParams;\n"
"\n"
"event_t eventCopied = async_work_group_copy(\n"
"(local int*)params, (global int*)globalParams, sizeof(struct PartitionFunctionParams) / sizeof(int), 0);\n"
"wait_group_events(1, &eventCopied);\n"
"#else\n"
"#define params globalParams\n"
"#endif\n"
"\n"
"Partition_ComputeReverse(\n"
"sequences + offsets[task_i],\n"
"sequences + offsets[task_j],\n"
"lengths[task_i],\n"
"lengths[task_j],\n"
"scheduling,\n"
"params,\n"
"localMemory,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij,\n"
"posteriorMatrices + tasks[taskId].offset_Pij);\n"
"}\n"
"\n"
"\n"
"\n"
"#else\n"
"\n"
"\n"
"__kernel void MultiplePartition_ComputeForward_long(\n"
"global		const	char*			sequences,\n"
"global		const	unsigned int*	offsets,\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			PartitionFunctionParams* globalParams MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				partition_t*	localMemory,\n"
"global				float*			columns,\n"
"const	int				columnsOffset)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"\n"
"#ifdef COPY_PARTITION_PARAMS\n"
"// copy scoring matrices from global to local memory\n"
"local struct PartitionFunctionParams localParams;\n"
"local struct PartitionFunctionParams *params = &localParams;\n"
"\n"
"event_t eventCopied = async_work_group_copy(\n"
"(local int*)params, (global int*)globalParams, sizeof(struct PartitionFunctionParams) / sizeof(int), 0);\n"
"wait_group_events(1, &eventCopied);\n"
"#else\n"
"#define params globalParams\n"
"#endif\n"
"\n"
"Partition_ComputeForward_long(\n"
"sequences + offsets[task_i],\n"
"sequences + offsets[task_j],\n"
"lengths[task_i],\n"
"lengths[task_j],\n"
"scheduling,\n"
"params,\n"
"localMemory,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij,\n"
"columns + taskId * columnsOffset);\n"
"}\n"
"\n"
"\n"
"__kernel void MultiplePartition_ComputeReverse_long(\n"
"global		const	char*			sequences,\n"
"global		const	unsigned int*	offsets,\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			PartitionFunctionParams* globalParams MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				partition_t*	localMemory,\n"
"global				float*			columns,\n"
"const	int				columnsOffset)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"\n"
"#ifdef COPY_PARTITION_PARAMS\n"
"// copy scoring matrices from global to local memory\n"
"local struct PartitionFunctionParams localParams;\n"
"local struct PartitionFunctionParams *params = &localParams;\n"
"\n"
"event_t eventCopied = async_work_group_copy(\n"
"(local int*)params, (global int*)globalParams, sizeof(struct PartitionFunctionParams) / sizeof(int), 0);\n"
"wait_group_events(1, &eventCopied);\n"
"#else\n"
"#define params globalParams\n"
"#endif\n"
"\n"
"Partition_ComputeReverse_long(\n"
"sequences + offsets[task_i],\n"
"sequences + offsets[task_j],\n"
"lengths[task_i],\n"
"lengths[task_j],\n"
"scheduling,\n"
"params,\n"
"localMemory,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij,\n"
"posteriorMatrices + tasks[taskId].offset_Pij,\n"
"columns + taskId * columnsOffset);\n"
"}\n"
"\n"
"#endif\n"
,
"#pragma once\n"
"\n"
"#ifndef LONG_KERNELS\n"
"\n"
"__kernel void MultipleProbabilistic_ComputeAll(\n"
"global		const	char*			sequences,\n"
"global		const	unsigned int*	offsets,\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			ProbabilisticParams* globalParams MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				float*			localMemory)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"\n"
"#ifdef COPY_PROBABILISTIC_PARAMS\n"
"// copy scoring matrices from global to local memory\n"
"local struct ProbabilisticParams localParams;\n"
"local struct ProbabilisticParams *params = &localParams;\n"
"\n"
"event_t eventCopied = async_work_group_copy(\n"
"(local int*)params, (global int*)globalParams, sizeof(struct ProbabilisticParams) / sizeof(int), 0);\n"
"wait_group_events(1, &eventCopied);\n"
"#else\n"
"#define params globalParams\n"
"#endif\n"
"\n"
"Probabilistic_ComputeAll(\n"
"sequences + offsets[task_i],\n"
"sequences + offsets[task_j],\n"
"lengths[task_i],\n"
"lengths[task_j],\n"
"scheduling,\n"
"params,\n"
"localMemory,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij);\n"
"}\n"
"\n"
"\n"
"__kernel void MultipleProbabilistic_ComputeForward(\n"
"global		const	char*			sequences,\n"
"global		const	unsigned int*	offsets,\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			ProbabilisticParams* globalParams MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				float*			localMemory)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"\n"
"#ifdef COPY_PROBABILISTIC_PARAMS\n"
"// copy scoring matrices from global to local memory\n"
"local struct ProbabilisticParams localParams;\n"
"local struct ProbabilisticParams *params = &localParams;\n"
"\n"
"event_t eventCopied = async_work_group_copy(\n"
"(local int*)params, (global int*)globalParams, sizeof(struct ProbabilisticParams) / sizeof(int), 0);\n"
"wait_group_events(1, &eventCopied);\n"
"#else\n"
"#define params globalParams\n"
"#endif\n"
"\n"
"Probabilistic_ComputeForwardMatrix(\n"
"sequences + offsets[task_i],\n"
"sequences + offsets[task_j],\n"
"lengths[task_i],\n"
"lengths[task_j],\n"
"scheduling,\n"
"params,\n"
"localMemory,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij);\n"
"}\n"
"\n"
"\n"
"__kernel void MultipleProbabilistic_ComputeBackward(\n"
"global		const	char*			sequences,\n"
"global		const	unsigned int*	offsets,\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			ProbabilisticParams* globalParams MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				float*			localMemory)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"int length_i = lengths[task_i];\n"
"int length_j = lengths[task_j];\n"
"\n"
"int layerSize = jaggedSize(length_j + 1, length_i + 1, jagSize_float);\n"
"\n"
"#ifdef COPY_PROBABILISTIC_PARAMS\n"
"// copy scoring matrices from global to local memory\n"
"local struct ProbabilisticParams localParams;\n"
"local struct ProbabilisticParams *params = &localParams;\n"
"\n"
"event_t eventCopied = async_work_group_copy(\n"
"(local int*)params, (global int*)globalParams, sizeof(struct ProbabilisticParams) / sizeof(int), 0);\n"
"wait_group_events(1, &eventCopied);\n"
"#else\n"
"#define params globalParams\n"
"#endif\n"
"\n"
"Probabilistic_ComputeBackwardMatrix(\n"
"sequences + offsets[task_i],\n"
"sequences + offsets[task_j],\n"
"length_i,\n"
"length_j,\n"
"scheduling,\n"
"params,\n"
"localMemory,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij + layerSize) ;\n"
"}\n"
"\n"
"__kernel void MultipleProbabilistic_Combine(\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			ProbabilisticParams* globalParams MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"int length_i = lengths[task_i];\n"
"int length_j = lengths[task_j];\n"
"\n"
"int layerSize = jaggedSize(length_j + 1, length_i + 1, jagSize_float);\n"
"\n"
"global float* forward	= auxiliaryMatrices + tasks[taskId].offset_Aij;\n"
"global float* backward = auxiliaryMatrices + tasks[taskId].offset_Aij + layerSize;\n"
"\n"
"local float localTotal;\n"
"\n"
"// restore original values\n"
"if (get_local_id(0) == 0) {\n"
"int lastIndex = jaggedIndex_float(length_j, length_i, length_i + 1);\n"
"localTotal = (forward[0] + backward[lastIndex]) / 2;\n"
"forward[0] = LOG_ZERO;\n"
"backward[lastIndex] = globalParams->initial[0];\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"float total = localTotal;\n"
"\n"
"Probabilistic_ComputePosteriorMatrix(\n"
"length_i,\n"
"length_j,\n"
"total,\n"
"scheduling,\n"
"forward,\n"
"backward,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij);\n"
"}\n"
"\n"
"\n"
"#else\n"
"\n"
"__kernel void MultipleProbabilistic_ComputeAll_long(\n"
"global		const	char*			sequences,\n"
"global		const	unsigned int*	offsets,\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			ProbabilisticParams* globalParams MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				float*			localMemory,\n"
"global				float*			columns,\n"
"const	int				columnsOffset)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"\n"
"#ifdef COPY_PROBABILISTIC_PARAMS\n"
"// copy scoring matrices from global to local memory\n"
"local struct ProbabilisticParams localParams;\n"
"local struct ProbabilisticParams *params = &localParams;\n"
"\n"
"event_t eventCopied = async_work_group_copy(\n"
"(local int*)params, (global int*)globalParams, sizeof(struct ProbabilisticParams) / sizeof(int), 0);\n"
"wait_group_events(1, &eventCopied);\n"
"#else\n"
"#define params globalParams\n"
"#endif\n"
"\n"
"Probabilistic_ComputeAll_long(\n"
"sequences + offsets[task_i],\n"
"sequences + offsets[task_j],\n"
"lengths[task_i],\n"
"lengths[task_j],\n"
"scheduling,\n"
"params,\n"
"localMemory,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij,\n"
"columns + taskId * columnsOffset);\n"
"}\n"
"\n"
"\n"
"__kernel void MultipleProbabilistic_ComputeForward_long(\n"
"global		const	char*			sequences,\n"
"global		const	unsigned int*	offsets,\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			ProbabilisticParams* globalParams MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				float*			localMemory,\n"
"global				float*			columns,\n"
"const	int				columnsOffset)\n"
"\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"\n"
"#ifdef COPY_PROBABILISTIC_PARAMS\n"
"// copy scoring matrices from global to local memory\n"
"local struct ProbabilisticParams localParams;\n"
"local struct ProbabilisticParams *params = &localParams;\n"
"\n"
"event_t eventCopied = async_work_group_copy(\n"
"(local int*)params, (global int*)globalParams, sizeof(struct ProbabilisticParams) / sizeof(int), 0);\n"
"wait_group_events(1, &eventCopied);\n"
"#else\n"
"#define params globalParams\n"
"#endif\n"
"\n"
"Probabilistic_ComputeForwardMatrix_long(\n"
"sequences + offsets[task_i],\n"
"sequences + offsets[task_j],\n"
"lengths[task_i],\n"
"lengths[task_j],\n"
"scheduling,\n"
"params,\n"
"localMemory,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij,\n"
"columns + taskId * columnsOffset);\n"
"}\n"
"\n"
"__kernel void MultipleProbabilistic_ComputeBackward_long(\n"
"global		const	char*			sequences,\n"
"global		const	unsigned int*	offsets,\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			ProbabilisticParams* globalParams MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				float*			posteriorMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling,\n"
"local				float*			localMemory,\n"
"global				float*			columns,\n"
"const	int				columnsOffset)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"int length_i = lengths[task_i];\n"
"int length_j = lengths[task_j];\n"
"\n"
"int layerSize = jaggedSize(length_j + 1, length_i + 1, jagSize_float);\n"
"\n"
"#ifdef COPY_PROBABILISTIC_PARAMS\n"
"// copy scoring matrices from global to local memory\n"
"local struct ProbabilisticParams localParams;\n"
"local struct ProbabilisticParams *params = &localParams;\n"
"\n"
"event_t eventCopied = async_work_group_copy(\n"
"(local int*)params, (global int*)globalParams, sizeof(struct ProbabilisticParams) / sizeof(int), 0);\n"
"wait_group_events(1, &eventCopied);\n"
"#else\n"
"#define params globalParams\n"
"#endif\n"
"\n"
"Probabilistic_ComputeBackwardMatrix_long(\n"
"sequences + offsets[task_i],\n"
"sequences + offsets[task_j],\n"
"length_i,\n"
"length_j,\n"
"scheduling,\n"
"params,\n"
"localMemory,\n"
"auxiliaryMatrices + tasks[taskId].offset_Aij + layerSize,\n"
"columns + taskId * columnsOffset) ;\n"
"}\n"
"\n"
"__kernel void MultipleProbabilistic_Combine_long(\n"
"global		const	unsigned int*	lengths,\n"
"global		const	struct			ProbabilisticParams* globalParams MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"global				float*			auxiliaryMatrices,\n"
"global				struct			PosteriorTask* tasks,\n"
"struct			PosteriorSchedule scheduling)\n"
"{\n"
"// execute computations\n"
"int taskId = get_group_id(0);\n"
"int task_i = tasks[taskId].i;\n"
"int task_j = tasks[taskId].j;\n"
"int length_i = lengths[task_i];\n"
"int length_j = lengths[task_j];\n"
"\n"
"int layerSize = jaggedSize(length_j + 1, length_i + 1, jagSize_float);\n"
"\n"
"global float* forward	= auxiliaryMatrices + tasks[taskId].offset_Aij;\n"
"global float* backward = auxiliaryMatrices + tasks[taskId].offset_Aij + layerSize;\n"
"\n"
"local float localTotal;\n"
"\n"
"// restore original values\n"
"if (get_local_id(0) == 0) {\n"
"int lastIndex = jaggedIndex_float(length_j, length_i, length_i + 1);\n"
"localTotal = (forward[0] + backward[lastIndex]) / 2;\n"
"forward[0] = LOG_ZERO;\n"
"backward[lastIndex] = globalParams->initial[0];\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"float total = localTotal;\n"
"\n"
"Probabilistic_ComputePosteriorMatrix_long(\n"
"length_i,\n"
"length_j,\n"
"total,\n"
"scheduling,\n"
"forward,\n"
"backward,\n"
"forward);\n"
"}\n"
"\n"
"#endif\n"
,
"#pragma once\n"
"\n"
"#ifndef LONG_KERNELS\n"
"\n"
"__kernel void MultipleSparse_Compute(\n"
"global	const	unsigned int* lengths,\n"
"global			float* sparseMatrices,\n"
"global			float* posteriorMatrices,\n"
"global	const	struct PosteriorTask* tasks,\n"
"struct PosteriorSchedule scheduling,\n"
"local			float* localMemory,\n"
"int				transpose)\n"
"{\n"
"int taskId = get_group_id(0);\n"
"int i = tasks[taskId].i;\n"
"int j = tasks[taskId].j;\n"
"\n"
"if (!transpose) {\n"
"SparseMatrix_Generate(\n"
"lengths[i],\n"
"lengths[j],\n"
"posteriorMatrices + tasks[taskId].offset_Pij,\n"
"localMemory,\n"
"sparseMatrices + tasks[taskId].offset_Aij);\n"
"} else {\n"
"SparseMatrix_GenerateTranspose(\n"
"lengths[i],\n"
"lengths[j],\n"
"posteriorMatrices + tasks[taskId].offset_Pij,\n"
"localMemory,\n"
"sparseMatrices+ tasks[taskId].offset_Aij);\n"
"}\n"
"}\n"
"\n"
"#else\n"
"\n"
"__kernel void MultipleSparse_Compute_long(\n"
"global	const	unsigned int* lengths,\n"
"global			float* sparseMatrices,\n"
"global			float* posteriorMatrices,\n"
"global	const	struct PosteriorTask* tasks,\n"
"struct PosteriorSchedule scheduling,\n"
"local			float* localMemory,\n"
"int				transpose)\n"
"{\n"
"int taskId = get_group_id(0);\n"
"int i = tasks[taskId].i;\n"
"int j = tasks[taskId].j;\n"
"\n"
"if (!transpose) {\n"
"SparseMatrix_Generate_verylong(\n"
"lengths[i],\n"
"lengths[j],\n"
"posteriorMatrices + tasks[taskId].offset_Pij,\n"
"localMemory,\n"
"sparseMatrices + tasks[taskId].offset_Aij);\n"
"} else {\n"
"SparseMatrix_GenerateTranspose_long(\n"
"lengths[i],\n"
"lengths[j],\n"
"posteriorMatrices + tasks[taskId].offset_Pij,\n"
"localMemory,\n"
"sparseMatrices+ tasks[taskId].offset_Aij);\n"
"}\n"
"}\n"
"\n"
"#endif\n"
,
"#define GAP_OPEN		params->gapOpen\n"
"#define GAP_EXTEND		params->gapExt\n"
"\n"
"\n"
"#ifdef USE_FLOAT_PARTITION\n"
"#define jagSize jagSize_float\n"
"#define TERM_GAP_OPEN	1.0f\n"
"#define TERM_GAP_EXTEND 1.0f\n"
"#define PROB_LO 0.001f\n"
"#define PROB_HI 1.0f\n"
"\n"
"typedef float partition_t;\n"
"#else\n"
"#define jagSize jagSize_double\n"
"#define TERM_GAP_OPEN	1.0\n"
"#define TERM_GAP_EXTEND 1.0\n"
"#define PROB_LO 0.001\n"
"#define PROB_HI 1.0\n"
"\n"
"typedef double partition_t;\n"
"#endif\n"
"\n"
"\n"
"struct PartitionFunctionParams\n"
"{\n"
"partition_t termGapOpen;\n"
"partition_t termGapExtend;\n"
"partition_t gapOpen;\n"
"partition_t gapExt;\n"
"partition_t subMatrix[26 * 26];\n"
"};\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeForward(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				partition_t* aux,\n"
"global				partition_t* forward)\n"
"{\n"
"\n"
"int j = get_local_id(0);\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"#define Zm forward\n"
"#define Ze aux\n"
"local partition_t* Zf = aux + width * 1;\n"
"local partition_t* Ze_Zf_Zm_prev = aux + width * 2;\n"
"\n"
"// calculate indices\n"
"int ij_cur = jaggedIndex(j, -j, height, jagSize);		// 0 - current\n"
"int ij_left = jaggedIndex(j - 1, -j, height, jagSize);	// 1 - left\n"
"\n"
"// initializations\n"
"if (j <= seq2Length) {\n"
"Zf[j] = 0;\n"
"Ze[j] = (j > 0);\n"
"}\n"
"\n"
"if (j == 0) {\n"
"Ze_Zf_Zm_prev[0] = 1; // next elements are not needed\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"partition_t Zm_up = (j == 0);\n"
"\n"
"for (int iter = 0, iterCount = seq1Length + seq2Length + 1; iter < iterCount; ++iter)\n"
"{\n"
"int i = iter - j;	// calculate i coordinate starting from row 0\n"
"\n"
"partition_t Ze_Zf_Zm_diag;\n"
"partition_t Ze_left;\n"
"\n"
"// utilise elements from Ze and Ze_Zf_Zm_prev arrays\n"
"if (j > 0 && j < width) {\n"
"Ze_Zf_Zm_diag = Ze_Zf_Zm_prev[j - 1];\n"
"Ze_left = Ze[j - 1];\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// update Ze_Zf_Zm_prev array\n"
"if (j < width) {\n"
"Ze_Zf_Zm_prev[j] = Ze[j] + Zf[j] + Zm_up;\n"
"}\n"
"\n"
"// initializations for first column\n"
"if (j == 0) {\n"
"if (i == 1) {\n"
"Zm_up = 0;\n"
"Zf[0] = 1;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) // if indices are legal\n"
"{\n"
"// initialization of first row\n"
"if (i > 0 && j > 0) {\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// update Zf\n"
"Zf[j] = Zm_up * ((j == seq2Length) ? TERM_GAP_OPEN : GAP_OPEN) +\n"
"Zf[j] * ((j == seq2Length) ? TERM_GAP_EXTEND : GAP_EXTEND);\n"
"\n"
"Zm_up = Ze_Zf_Zm_diag * params->subMatrix[id]; // current Zm_s0, future Zm_up\n"
"Ze_Zf_Zm_diag = Zm[ij_left]; // use variable to temporarily store Zm_left\n"
"\n"
"// update Ze\n"
"Ze[j] = Ze_Zf_Zm_diag * ((i == seq1Length) ? TERM_GAP_OPEN : GAP_OPEN) +\n"
"Ze_left * ((i == seq1Length) ? TERM_GAP_EXTEND: GAP_EXTEND);\n"
"}\n"
"\n"
"Zm[ij_cur] = Zm_up;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"ij_cur += jagSize; // move to next row\n"
"ij_left += jagSize;\n"
"}\n"
"\n"
"//store the sum of zm zf ze (m,n)s in zm's 0,0 th position\n"
"if (j == seq2Length) {\n"
"Zm[0] = Ze[j] + Zf[j] + Zm_up;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"#undef Zm\n"
"}\n"
"\n"
"__kernel\n"
"void Partition_ComputeReverse(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				partition_t* aux,\n"
"global	const		partition_t* forward,\n"
"global				float* posterior)\n"
"{\n"
"int j = get_local_id(0);\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"int j_rev = seq2Length - j;\n"
"\n"
"++seq1; // + 1 because of '@' character at the beginning of seq.\n"
"++seq2;\n"
"\n"
"#define Ze aux\n"
"local partition_t* Zf = aux + 1 * width;\n"
"local partition_t* Zm = aux + 2 * width;\n"
"local partition_t* Ze_Zf_Zm_prev = aux + 3 * width;\n"
"partition_t forward0 = forward[0];\n"
"\n"
"// calculate indices\n"
"int ij_cur = jaggedIndex_float(j, seq1Length + j_rev, height);					// s0 - current\n"
"int ij_diag = jaggedIndex_float(j + 1, seq1Length + 1 + j_rev, height);			// s1 - diagonal\n"
"int ij_forward_diag = jaggedIndex(j + 1, seq1Length + 1 + j_rev, height, jagSize);	// diagonal\n"
"\n"
"if (j < width) {\n"
"Zm[j] = (j_rev == 0); // 0 ... 0 1\n"
"Ze[j] = (j_rev > 0); // 1 ... 1 0\n"
"Zf[j] = 0;\n"
"}\n"
"\n"
"if (j_rev == 0) {\n"
"Ze_Zf_Zm_prev[seq2Length] = 1;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"for (int iter = 0, iterCount = seq1Length + seq2Length + 1; iter < iterCount; ++iter) {\n"
"int i = seq1Length + j_rev - iter;\n"
"\n"
"partition_t Ze_Zf_Zm = 0;\n"
"partition_t Zm_diag = 0;\n"
"partition_t Ze_diag = 0;\n"
"\n"
"if (j_rev > 0) {\n"
"Ze_Zf_Zm = Ze_Zf_Zm_prev[j + 1];\n"
"Zm_diag = Zm[j + 1];\n"
"Ze_diag = Ze[j + 1];\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// initialize last column\n"
"if (j_rev > 0) {\n"
"Ze_Zf_Zm_prev[j] = Ze[j] + Zf[j] + Zm[j];\n"
"}\n"
"\n"
"else if (j_rev == 0)  {\n"
"if (i == seq1Length - 1) {\n"
"Zf[j] = 1;\n"
"Zm[j] = 0;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length)\n"
"{\n"
"#ifdef EXACT_REVERSE_PARTITION\n"
"if (i == 0 || j == 0) { posterior[ij_cur] = 0; } // initialization of first posterior row and column\n"
"#endif\n"
"\n"
"if (i < seq1Length && j < seq2Length) // for all elements but last row and column\n"
"{\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// terminal gaps may be ignored here as last row and column are not used for posterior calculation\n"
"Zf[j] = Zm[j] * GAP_OPEN + Zf[j] * GAP_EXTEND;\n"
"Ze[j] = Zm_diag * GAP_OPEN + Ze_diag * GAP_EXTEND;\n"
"\n"
"// use Zm_diag to store probability\n"
"#define probability Zm_diag\n"
"probability = (forward0 == 0) ? 0 : forward[ij_forward_diag] * Ze_Zf_Zm / forward0;\n"
"\n"
"Zm[j] = Ze_Zf_Zm * params->subMatrix[id]; // multiply by score\n"
"\n"
"posterior[ij_diag] = (probability <= PROB_HI && probability >= PROB_LO) ? probability : 0;\n"
"#undef probability\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"ij_cur -= jagSize_float;\n"
"ij_diag -= jagSize_float;\n"
"ij_forward_diag -= jagSize;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeAll(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				partition_t* auxiliaryRows,\n"
"global				partition_t* auxiliaryLayer,\n"
"global				float* posterior)\n"
"{\n"
"Partition_ComputeForward(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer);\n"
"Partition_ComputeReverse(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer, posterior);\n"
"}\n"
,
"#define GAP_OPEN		params->gapOpen\n"
"#define GAP_EXTEND		params->gapExt\n"
"\n"
"#define TERM_GAP_OPEN	LOG_ONE\n"
"#define TERM_GAP_EXTEND LOG_ONE\n"
"#define PROB_LO 0.001f\n"
"#define PROB_HI 1.0f\n"
"typedef float partition_t;\n"
"\n"
"\n"
"struct PartitionFunctionParams\n"
"{\n"
"float termGapOpen;\n"
"float termGapExtend;\n"
"float gapOpen;\n"
"float gapExt;\n"
"float subMatrix[26 * 26];\n"
"};\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeForward(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* aux,\n"
"global				float* forward)\n"
"{\n"
"\n"
"int j = get_local_id(0);\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"#define Zm forward\n"
"#define Ze aux\n"
"local float* Zf = aux + width * 1;\n"
"local float* Ze_Zf_Zm_prev = aux + width * 2;\n"
"\n"
"// calculate indices\n"
"int ij_cur = jaggedIndex(j, -j, height, JAG_FLOAT);			// 0 - current\n"
"int ij_left = jaggedIndex(j - 1, -j, height, JAG_FLOAT);	// 1 - left\n"
"\n"
"// initializations\n"
"if (j <= seq2Length) {\n"
"Zf[j] = LOG_ZERO;\n"
"Ze[j] = (j > 0) ? LOG_ONE : LOG_ZERO;\n"
"}\n"
"\n"
"if (j == 0) {\n"
"Ze_Zf_Zm_prev[0] = LOG_ONE;		// next elements are not needed\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"float Zm_up = (j == 0) ? LOG_ONE : LOG_ZERO;\n"
"\n"
"for (int iter = 0, iterCount = seq1Length + seq2Length + 1; iter < iterCount; ++iter)\n"
"{\n"
"int i = iter - j;	// calculate i coordinate starting from row 0\n"
"\n"
"float Ze_Zf_Zm_diag;\n"
"float Ze_left;\n"
"\n"
"// utilise elements from Ze and Ze_Zf_Zm_prev arrays\n"
"if (j > 0 && j < width) {\n"
"Ze_Zf_Zm_diag = Ze_Zf_Zm_prev[j - 1];\n"
"Ze_left = Ze[j - 1];\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// update Ze_Zf_Zm_prev array\n"
"if (j < width) {\n"
"float t = Ze[j];\n"
"logOfSum_private(&t, Zf[j]);\n"
"logOfSum_private(&t, Zm_up);\n"
"Ze_Zf_Zm_prev[j] = t;\n"
"}\n"
"\n"
"// initializations for first column\n"
"if (j == 0) {\n"
"if (i == 1) {\n"
"Zm_up = LOG_ZERO;\n"
"Zf[0] = LOG_ONE;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) { // if indices are legal\n"
"// initialization of first row\n"
"if (i > 0 && j > 0) {\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// update Zf\n"
"float t = Zm_up + ((j == seq2Length) ? TERM_GAP_OPEN : GAP_OPEN);\n"
"logOfSum_private(&t, Zf[j] + ((j == seq2Length) ? TERM_GAP_EXTEND : GAP_EXTEND));\n"
"Zf[j] = t;\n"
"\n"
"Zm_up = Ze_Zf_Zm_diag + params->subMatrix[id]; // current Zm_s0, future Zm_up\n"
"Ze_Zf_Zm_diag = Zm[ij_left]; // use variable to temporarily store Zm_left\n"
"\n"
"// update Ze\n"
"t = Ze_Zf_Zm_diag + ((i == seq1Length) ? TERM_GAP_OPEN : GAP_OPEN);\n"
"logOfSum_private(&t, Ze_left + ((i == seq1Length) ? TERM_GAP_EXTEND: GAP_EXTEND));\n"
"Ze[j] = t;\n"
"}\n"
"\n"
"Zm[ij_cur] = Zm_up;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"ij_cur += JAG_FLOAT; // move to next row\n"
"ij_left += JAG_FLOAT;\n"
"}\n"
"\n"
"//store the sum of zm zf ze (m,n)s in zm's 0,0 th position\n"
"if (j == seq2Length) {\n"
"float t = Ze[j];\n"
"logOfSum_private(&t, Zf[j]);\n"
"logOfSum_private(&t, Zm_up);\n"
"Zm[0] = t;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"#undef Zm\n"
"}\n"
"\n"
"__kernel\n"
"void Partition_ComputeReverse(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* aux,\n"
"global	const		float* forward,\n"
"global				float* posterior)\n"
"{\n"
"int j = get_local_id(0);\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"int j_rev = seq2Length - j;\n"
"\n"
"++seq1; // + 1 because of '@' character at the beginning of seq.\n"
"++seq2;\n"
"\n"
"#define Ze aux\n"
"local float* Zf = aux + 1 * width;\n"
"local float* Zm = aux + 2 * width;\n"
"local float* Ze_Zf_Zm_prev = aux + 3 * width;\n"
"float forward0 = forward[0];\n"
"\n"
"// calculate indices\n"
"int ij_cur = jaggedIndex(j, seq1Length + j_rev, height, JAG_FLOAT);						// current\n"
"int ij_diag = jaggedIndex(j + 1, seq1Length + 1 + j_rev, height, JAG_FLOAT);			// diagonal\n"
"int ij_forward_diag = jaggedIndex(j + 1, seq1Length + 1 + j_rev, height, JAG_FLOAT);	// diagonal\n"
"\n"
"if (j < width) {\n"
"Zm[j] = (j_rev == 0) ? LOG_ONE : LOG_ZERO; // 0 ... 0 1\n"
"Ze[j] = (j_rev > 0)  ? LOG_ONE : LOG_ZERO; // 1 ... 1 0\n"
"Zf[j] = LOG_ZERO;\n"
"}\n"
"\n"
"if (j_rev == 0) {\n"
"Ze_Zf_Zm_prev[seq2Length] = LOG_ONE;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"for (int iter = 0, iterCount = seq1Length + seq2Length + 1; iter < iterCount; ++iter) {\n"
"int i = seq1Length + j_rev - iter;\n"
"\n"
"float Ze_Zf_Zm = LOG_ZERO;\n"
"float Zm_diag = LOG_ZERO;\n"
"float Ze_diag = LOG_ZERO;\n"
"\n"
"if (j_rev > 0) {\n"
"Ze_Zf_Zm = Ze_Zf_Zm_prev[j + 1];\n"
"Zm_diag = Zm[j + 1];\n"
"Ze_diag = Ze[j + 1];\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// initialize last column\n"
"if (j_rev > 0) {\n"
"float t = Ze[j];\n"
"logOfSum_private(&t, Zf[j]);\n"
"logOfSum_private(&t, Zm[j]);\n"
"Ze_Zf_Zm_prev[j] = t;\n"
"}\n"
"\n"
"else if (j_rev == 0)  {\n"
"if (i == seq1Length - 1) {\n"
"Zf[j] = LOG_ONE;\n"
"Zm[j] = LOG_ZERO;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length)\n"
"{\n"
"#ifdef EXACT_REVERSE_PARTITION\n"
"if (i == 0 || j == 0) { posterior[ij_cur] = 0; } // initialization of first posterior row and column\n"
"#endif\n"
"\n"
"if (i < seq1Length && j < seq2Length) // for all elements but last row and column\n"
"{\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// terminal gaps may be ignored here as last row and column are not used for posterior calculation\n"
"float t = Zm[j] + GAP_OPEN;\n"
"logOfSum_private(&t, Zf[j] + GAP_EXTEND);\n"
"Zf[j] = t;\n"
"\n"
"t = Zm_diag + GAP_OPEN;\n"
"logOfSum_private(&t, Ze_diag + GAP_EXTEND);\n"
"Ze[j] = t;\n"
"\n"
"// use t to store probability\n"
"t = (forward0 == LOG_ZERO) ? 0 : fast_exp(forward[ij_forward_diag] + Ze_Zf_Zm - forward0);\n"
"Zm[j] = Ze_Zf_Zm + params->subMatrix[id]; // multiply by score\n"
"posterior[ij_diag] = (t <= PROB_HI && t >= PROB_LO) ? t : 0;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"ij_cur -= JAG_FLOAT;\n"
"ij_diag -= JAG_FLOAT;\n"
"ij_forward_diag -= JAG_FLOAT;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeAll(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* auxiliaryRows,\n"
"global				float* auxiliaryLayer,\n"
"global				float* posterior)\n"
"{\n"
"Partition_ComputeForward(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer);\n"
"Partition_ComputeReverse(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer, posterior);\n"
"}\n"
,
"#define GAP_OPEN		params->gapOpen\n"
"#define GAP_EXTEND		params->gapExt\n"
"\n"
"#define TERM_GAP_OPEN	LOG_ONE\n"
"#define TERM_GAP_EXTEND LOG_ONE\n"
"#define PROB_LO 0.001f\n"
"#define PROB_HI 1.0f\n"
"typedef float partition_t;\n"
"\n"
"\n"
"struct PartitionFunctionParams\n"
"{\n"
"float termGapOpen;\n"
"float termGapExtend;\n"
"float gapOpen;\n"
"float gapExt;\n"
"float subMatrix[26 * 26];\n"
"};\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeForward(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* aux,\n"
"global				float* forward)\n"
"{\n"
"\n"
"int j = get_local_id(0);\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"#define Zm forward\n"
"#define Ze aux\n"
"local float* Ze_Zf_Zm_prev = aux + width;\n"
"\n"
"// calculate indices\n"
"int ij_cur = jaggedIndex(j, -j, height, JAG_FLOAT);			// 0 - current\n"
"int ij_left = jaggedIndex(j - 1, -j, height, JAG_FLOAT);	// 1 - left\n"
"\n"
"// initializations\n"
"if (j <= seq2Length) {\n"
"Ze[j] = (j > 0) ? LOG_ONE : LOG_ZERO;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"float Zm_up = (j == 0) ? LOG_ONE : LOG_ZERO;\n"
"float Zf_up = LOG_ZERO;\n"
"float Ze_Zf_Zm_diag = LOG_ZERO;\n"
"\n"
"for (int iter = 0, iterCount = seq1Length + seq2Length + 1; iter < iterCount; ++iter) {\n"
"int i = iter - j;	// calculate i coordinate starting from row 0\n"
"\n"
"float Ze_left;\n"
"bool legal = i >= 0 && i <= seq1Length && j <= seq2Length;\n"
"\n"
"// initializations for first column\n"
"if (j == 0 && i == 1) {\n"
"Zm_up = LOG_ZERO;\n"
"Zf_up = LOG_ONE;\n"
"}\n"
"\n"
"// utilise elements from Ze array and update Ze_Zf_Zm_prev array\n"
"if (legal && j > 0) {\n"
"Ze_left = Ze[j - 1];\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (legal) { // if indices are legal\n"
"// initialization of first row\n"
"if (i > 0 && j > 0) {\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// update Zf_up\n"
"float t = Zm_up + ((j == seq2Length) ? TERM_GAP_OPEN : GAP_OPEN);\n"
"logOfSum_private(&t, Zf_up + ((j == seq2Length) ? TERM_GAP_EXTEND : GAP_EXTEND));\n"
"Zf_up = t;\n"
"\n"
"// update Zm\n"
"Zm_up = Ze_Zf_Zm_diag + params->subMatrix[id]; // now Zm_cur, future Zm_up\n"
"\n"
"// update Ze\n"
"t = Zm[ij_left] + ((i == seq1Length) ? TERM_GAP_OPEN : GAP_OPEN);\n"
"logOfSum_private(&t, Ze_left + ((i == seq1Length) ? TERM_GAP_EXTEND: GAP_EXTEND));\n"
"Ze[j] = t;\n"
"}\n"
"\n"
"Zm[ij_cur] = Zm_up;\n"
"Ze_Zf_Zm_diag = Ze_Zf_Zm_prev[j - 1];\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (legal) {\n"
"float t = Ze[j];\n"
"logOfSum_private(&t, Zf_up);\n"
"logOfSum_private(&t, Zm_up);\n"
"Ze_Zf_Zm_prev[j] = t;\n"
"}\n"
"\n"
"ij_cur += JAG_FLOAT; // move to next row\n"
"ij_left += JAG_FLOAT;\n"
"}\n"
"\n"
"//store the sum of zm Zf_up ze (m,n)s in zm's 0,0 th position\n"
"if (j == seq2Length) {\n"
"Zm[0] = Ze_Zf_Zm_prev[j];\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"#undef Zm\n"
"}\n"
"\n"
"__kernel\n"
"void Partition_ComputeReverse(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* aux,\n"
"global	const		float* forward,\n"
"global				float* posterior)\n"
"{\n"
"int j = get_local_id(0);\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"int j_rev = seq2Length - j;\n"
"\n"
"++seq1; // + 1 because of '@' character at the beginning of seq.\n"
"++seq2;\n"
"\n"
"#define Ze aux\n"
"local float* Zm = aux + 1 * width;\n"
"local float* Ze_Zf_Zm_prev = aux + 2 * width;\n"
"float forward0 = forward[0];\n"
"float Zf = LOG_ZERO;\n"
"float Ze_Zf_Zm = LOG_ZERO;\n"
"\n"
"// calculate indices\n"
"int ij_cur = jaggedIndex(j, seq1Length + j_rev, height, JAG_FLOAT);						// current\n"
"int ij_diag = jaggedIndex(j + 1, seq1Length + 1 + j_rev, height, JAG_FLOAT);			// diagonal\n"
"int ij_forward_diag = jaggedIndex(j + 1, seq1Length + 1 + j_rev, height, JAG_FLOAT);	// diagonal\n"
"\n"
"if (j < width) {\n"
"Zm[j] = (j_rev == 0) ? LOG_ONE : LOG_ZERO; // 0 ... 0 1\n"
"Ze[j] = (j_rev > 0)  ? LOG_ONE : LOG_ZERO; // 1 ... 1 0\n"
"}\n"
"\n"
"if (j_rev == 0) {\n"
"Ze_Zf_Zm_prev[seq2Length] = LOG_ONE;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"for (int iter = 0, iterCount = seq1Length + seq2Length + 1; iter < iterCount; ++iter) {\n"
"int i = seq1Length + j_rev - iter;\n"
"\n"
"float Zm_diag = LOG_ZERO;\n"
"float Ze_diag = LOG_ZERO;\n"
"\n"
"if (j_rev > 0) {\n"
"Zm_diag = Zm[j + 1];\n"
"Ze_diag = Ze[j + 1];\n"
"} else if (i == seq1Length - 1)  { // initialisation of last row and column\n"
"Zf = LOG_ONE;\n"
"Zm[j] = LOG_ZERO;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) {\n"
"\n"
"#ifdef EXACT_REVERSE_PARTITION\n"
"if (i == 0 || j == 0) { posterior[ij_cur] = 0; } // initialization of first posterior row and column\n"
"#endif\n"
"\n"
"if (i < seq1Length && j < seq2Length) // for all elements but last row and column\n"
"{\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// terminal gaps may be ignored here as last row and column are not used for posterior calculation\n"
"float t = Zm[j] + GAP_OPEN;\n"
"logOfSum_private(&t, Zf + GAP_EXTEND);\n"
"Zf = t;\n"
"\n"
"t = Zm_diag + GAP_OPEN;\n"
"logOfSum_private(&t, Ze_diag + GAP_EXTEND);\n"
"Ze[j] = t;\n"
"\n"
"// use t to store probability\n"
"t = (forward0 == LOG_ZERO) ? 0 : fast_exp(forward[ij_forward_diag] + Ze_Zf_Zm - forward0);\n"
"Zm[j] = Ze_Zf_Zm + params->subMatrix[id]; // multiply by score\n"
"posterior[ij_diag] = (t <= PROB_HI && t >= PROB_LO) ? t : 0;\n"
"}\n"
"\n"
"if (j_rev > 0) { Ze_Zf_Zm = Ze_Zf_Zm_prev[j + 1]; }\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// initialize last column\n"
"if (j_rev > 0) {\n"
"float t = Ze[j];\n"
"logOfSum_private(&t, Zf);\n"
"logOfSum_private(&t, Zm[j]);\n"
"Ze_Zf_Zm_prev[j] = t;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"ij_cur -= JAG_FLOAT;\n"
"ij_diag -= JAG_FLOAT;\n"
"ij_forward_diag -= JAG_FLOAT;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeAll(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* auxiliaryRows,\n"
"global				float* auxiliaryLayer,\n"
"global				float* posterior)\n"
"{\n"
"Partition_ComputeForward(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer);\n"
"Partition_ComputeReverse(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer, posterior);\n"
"}\n"
,
"#define GAP_OPEN		params->gapOpen\n"
"#define GAP_EXTEND		params->gapExt\n"
"\n"
"#define TERM_GAP_OPEN	LOG_ONE\n"
"#define TERM_GAP_EXTEND LOG_ONE\n"
"#define PROB_LO 0.001f\n"
"#define PROB_HI 1.0f\n"
"typedef float partition_t;\n"
"\n"
"struct PartitionFunctionParams\n"
"{\n"
"float termGapOpen;\n"
"float termGapExtend;\n"
"float gapOpen;\n"
"float gapExt;\n"
"float subMatrix[26 * 26];\n"
"};\n"
"\n"
"__kernel\n"
"void Partition_ComputeForward_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* aux,\n"
"global				float* forward,\n"
"global				float* columns)\n"
"{\n"
"\n"
"#define j_offset get_local_id(0)\n"
"\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"#define Zm forward\n"
"#define Ze aux\n"
"local float* Zf = aux + get_local_size(0) * 1;\n"
"local float* Ze_Zf_Zm_prev = aux + get_local_size(0) * 2;\n"
"\n"
"#define Ze_column columns\n"
"global float* Ze_Zf_Zm_column = Ze_column + height;\n"
"\n"
"int j = j_offset;\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"float Zm_up;\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"// column offset in global memory\n"
"int ij_cur = jaggedIndex(j, -j_offset, height, JAG_FLOAT);		// seek to -j'th row\n"
"int ij_left = jaggedIndex(j - 1, -j_offset, height, JAG_FLOAT);\n"
"\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"\n"
"// initialization\n"
"if (j <= seq2Length) {\n"
"Zf[j_offset] = LOG_ZERO;\n"
"Ze[j_offset] = (j > 0) ? LOG_ONE : LOG_ZERO;\n"
"}\n"
"\n"
"if (j == 0) {\n"
"Ze_Zf_Zm_prev[0] = LOG_ONE; // next elements are not needed\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"Zm_up = (j == 0) ? LOG_ONE : LOG_ZERO;\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = iter - j_offset;	// calculate i coordinate starting from row 0\n"
"\n"
"float Ze_Zf_Zm_diag;\n"
"float Ze_left;\n"
"\n"
"// utilise elements from Ze and Ze_Zf_Zm_prev arrays\n"
"if (j > 0 && j < width && i >= 0 && i <= seq1Length) {\n"
"if (j_offset == 0) {\n"
"Ze_Zf_Zm_diag = Ze_Zf_Zm_column[i];\n"
"Ze_left = Ze_column[i];\n"
"} else {\n"
"Ze_Zf_Zm_diag = Ze_Zf_Zm_prev[j_offset - 1];\n"
"Ze_left = Ze[j_offset - 1] ;\n"
"}\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// update Ze_Zf_Zm_prev array\n"
"if (j < width) {\n"
"float t = Ze[j_offset];\n"
"logOfSum_private(&t, Zf[j_offset]);\n"
"logOfSum_private(&t, Zm_up);\n"
"Ze_Zf_Zm_prev[j_offset] = t;\n"
"}\n"
"\n"
"// initializations for first column\n"
"if (j == 0) {\n"
"if (i == 1) {\n"
"Zm_up = LOG_ZERO;\n"
"Zf[0] = LOG_ONE;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) // if indices are legal\n"
"{\n"
"// initialization of first row\n"
"if (i > 0 && j > 0) {\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// update Zf\n"
"float t = Zm_up + ((j == seq2Length) ? TERM_GAP_OPEN : GAP_OPEN);\n"
"logOfSum_private(&t, Zf[j_offset] + ((j == seq2Length) ? TERM_GAP_EXTEND : GAP_EXTEND));\n"
"Zf[j_offset] = t;\n"
"\n"
"Zm_up = Ze_Zf_Zm_diag + params->subMatrix[id]; // current Zm_s0, future Zm_up\n"
"Ze_Zf_Zm_diag = Zm[ij_left]; // use variable to temporarily store Zm_left\n"
"\n"
"// update Ze\n"
"t = Ze_Zf_Zm_diag + ((i == seq1Length) ? TERM_GAP_OPEN : GAP_OPEN);\n"
"logOfSum_private(&t, Ze_left + ((i == seq1Length) ? TERM_GAP_EXTEND: GAP_EXTEND));\n"
"Ze[j_offset] = t;\n"
"}\n"
"\n"
"Zm[ij_cur] = Zm_up;\n"
"\n"
"// last thread in a group stores column\n"
"if (j_offset == get_local_size(0) - 1) {\n"
"Ze_column[i] = Ze[j_offset];\n"
"Ze_Zf_Zm_column[i] = Ze_Zf_Zm_prev[j_offset];\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"ij_cur += JAG_FLOAT; // move to next row\n"
"ij_left += JAG_FLOAT;\n"
"}\n"
"}\n"
"\n"
"//store the sum of zm zf ze (m,n)s in zm's 0,0 th position\n"
"if (j - get_local_size(0) == seq2Length) {\n"
"float t = Ze[j_offset];\n"
"logOfSum_private(&t, Zf[j_offset]);\n"
"logOfSum_private(&t, Zm_up);\n"
"Zm[0] = t;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"#undef Zm\n"
"#undef j_offset\n"
"#undef Ze_column\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeReverse_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* aux,\n"
"global	const		float* forward,\n"
"global				float* posterior,\n"
"global				float* columns)\n"
"{\n"
"int j_offset = get_local_id(0);\n"
"#define j_offset_rev (get_local_size(0) - get_local_id(0) - 1)\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"++seq1; // + 1 because of '@' character at the beginning of seq.\n"
"++seq2;\n"
"\n"
"#define Ze aux\n"
"float forward0 = forward[0];\n"
"local float* Zf = aux + 1 * get_local_size(0);\n"
"local float* Zm = aux + 2 * get_local_size(0);\n"
"local float* Ze_Zf_Zm_prev = aux + 3 * get_local_size(0);\n"
"\n"
"#define Ze_column columns\n"
"global float* Zm_column = Ze_column + height;\n"
"global float* Ze_Zf_Zm_column = Ze_column + 2 * height;\n"
"\n"
"int j = seq2Length - j_offset_rev;\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j -= get_local_size(0)) {\n"
"\n"
"int j_rev = seq2Length - j;\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"\n"
"// calculate indices\n"
"int ij_cur = jaggedIndex(j, seq1Length + j_offset_rev, height, JAG_FLOAT);\n"
"int ij_diag = jaggedIndex(j + 1, seq1Length + 1 + j_offset_rev, height, JAG_FLOAT);\n"
"int ij_forward_diag = jaggedIndex(j + 1, seq1Length + 1 + j_offset_rev, height, JAG_FLOAT);\n"
"\n"
"if (j >= 0) {\n"
"Zm[j_offset] = (j_rev == 0) ? LOG_ONE : LOG_ZERO; // 0 ... 0 1\n"
"Ze[j_offset] = (j_rev > 0) ? LOG_ONE : LOG_ZERO; // 1 ... 1 0\n"
"Zf[j_offset] = LOG_ZERO;\n"
"}\n"
"\n"
"if (j_rev == 0)  {\n"
"Ze_Zf_Zm_prev[j_offset] = LOG_ONE;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = seq1Length + j_offset_rev - iter;\n"
"\n"
"float Ze_Zf_Zm = LOG_ZERO;\n"
"float Zm_diag = LOG_ZERO;\n"
"float Ze_diag = LOG_ZERO;\n"
"\n"
"if (j_rev > 0 && i >= 0 && i <= seq1Length) { // except last column\n"
"if (j_offset_rev == 0) {\n"
"Ze_Zf_Zm = Ze_Zf_Zm_column[i];\n"
"Ze_diag = Ze_column[i];\n"
"Zm_diag = Zm_column[i];\n"
"} else {\n"
"Ze_Zf_Zm = Ze_Zf_Zm_prev[j_offset + 1];\n"
"Zm_diag = Zm[j_offset + 1];\n"
"Ze_diag = Ze[j_offset + 1];\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (j_rev > 0 && j >= 0) {\n"
"float t = Ze[j_offset];\n"
"logOfSum_private(&t, Zf[j_offset]);\n"
"logOfSum_private(&t, Zm[j_offset]);\n"
"Ze_Zf_Zm_prev[j_offset] = t;\n"
"}\n"
"else if (j_rev == 0)  {  // initialize last column\n"
"if (i == seq1Length - 1) {\n"
"Zf[j_offset] = LOG_ONE;\n"
"Zm[j_offset] = LOG_ZERO;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (j >= 0 && i >= 0 && i <= seq1Length && j <= seq2Length) {\n"
"\n"
"#ifdef EXACT_REVERSE_PARTITION\n"
"if (i == 0 || j == 0) { posterior[ij_cur] = 0; } // initialization of first posterior row and column\n"
"#endif\n"
"\n"
"if (i < seq1Length && j < seq2Length) // for all elements but last row and column\n"
"{\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// terminal gaps may be ignored here as last row and column are not used for posterior calculation\n"
"float t = Zm[j_offset] + GAP_OPEN;\n"
"logOfSum_private(&t, Zf[j_offset] + GAP_EXTEND);\n"
"Zf[j_offset] = t;\n"
"\n"
"t = Zm_diag + GAP_OPEN;\n"
"logOfSum_private(&t, Ze_diag + GAP_EXTEND);\n"
"Ze[j_offset] = t;\n"
"\n"
"// use t to store probability\n"
"t = (forward0 == LOG_ZERO) ? 0 : fast_exp(forward[ij_forward_diag] + Ze_Zf_Zm - forward0);\n"
"Zm[j_offset] = Ze_Zf_Zm + params->subMatrix[id]; // multiply by score\n"
"posterior[ij_diag] = (t <= PROB_HI && t >= PROB_LO) ? t : 0;\n"
"}\n"
"\n"
"// first thread in a group stores column\n"
"if (j_offset == 0) {\n"
"Ze_column[i] = Ze[0];\n"
"Zm_column[i] = Zm[0];\n"
"Ze_Zf_Zm_column[i] = Ze_Zf_Zm_prev[0];\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"ij_cur -= JAG_FLOAT;\n"
"ij_diag -= JAG_FLOAT;\n"
"ij_forward_diag -= JAG_FLOAT;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"#undef j_offset_rev\n"
"#undef Ze_column\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeAll_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* auxiliaryRows,\n"
"global				float* auxiliaryLayer,\n"
"global				float* posterior,\n"
"global				float* columns)\n"
"{\n"
"Partition_ComputeForward_long(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer, columns);\n"
"Partition_ComputeReverse_long(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer, posterior, columns);\n"
"}\n"
,
"#define GAP_OPEN		params->gapOpen\n"
"#define GAP_EXTEND		params->gapExt\n"
"\n"
"#define TERM_GAP_OPEN	LOG_ONE\n"
"#define TERM_GAP_EXTEND LOG_ONE\n"
"#define PROB_LO 0.001f\n"
"#define PROB_HI 1.0f\n"
"typedef float partition_t;\n"
"\n"
"struct PartitionFunctionParams\n"
"{\n"
"float termGapOpen;\n"
"float termGapExtend;\n"
"float gapOpen;\n"
"float gapExt;\n"
"float subMatrix[26 * 26];\n"
"};\n"
"__kernel\n"
"void Partition_ComputeForward_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* aux,\n"
"global				float* forward,\n"
"global				float* columns)\n"
"{\n"
"\n"
"#define j_offset get_local_id(0)\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"#define Zm forward\n"
"#define Ze aux\n"
"local float* Ze_Zf_Zm_prev = aux + get_local_size(0);\n"
"\n"
"#define Ze_column columns\n"
"global float* Ze_Zf_Zm_column = Ze_column + height;\n"
"\n"
"int j = j_offset;\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"// column offset in global memory\n"
"int ij_cur = jaggedIndex(j, -j_offset, height, JAG_FLOAT);		// seek to -j'th row\n"
"int ij_left = jaggedIndex(j - 1, -j_offset, height, JAG_FLOAT);\n"
"\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"\n"
"// initialization\n"
"if (j <= seq2Length) {\n"
"Ze[j_offset] = (j > 0) ? LOG_ONE : LOG_ZERO;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"float Zf_up = LOG_ZERO;\n"
"float Zm_up = (j == 0) ? LOG_ONE : LOG_ZERO;\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = iter - j_offset;	// calculate i coordinate starting from row 0\n"
"\n"
"float Ze_Zf_Zm_diag;\n"
"float Ze_left;\n"
"bool legal = i >= 0 && i <= seq1Length && j <= seq2Length;\n"
"\n"
"// utilise elements from Ze and Ze_Zf_Zm_prev arrays\n"
"if (legal && j > 0) {\n"
"if (j_offset == 0) {\n"
"Ze_left = Ze_column[i];\n"
"} else {\n"
"Ze_left = Ze[j_offset - 1] ;\n"
"}\n"
"}\n"
"\n"
"// initializations for first column\n"
"if (j == 0 && i == 1) {\n"
"Zm_up = LOG_ZERO;\n"
"Zf_up = LOG_ONE;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (legal) { // if indices are legal\n"
"// initialization of first row\n"
"if (i > 0 && j > 0) {\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// update Zf\n"
"float t = Zm_up + ((j == seq2Length) ? TERM_GAP_OPEN : GAP_OPEN);\n"
"logOfSum_private(&t, Zf_up + ((j == seq2Length) ? TERM_GAP_EXTEND : GAP_EXTEND));\n"
"Zf_up = t;\n"
"\n"
"Zm_up = Ze_Zf_Zm_diag + params->subMatrix[id]; // current Zm_s0, future Zm_up\n"
"Ze_Zf_Zm_diag = Zm[ij_left]; // use variable to temporarily store Zm_left\n"
"\n"
"// update Ze\n"
"t = Ze_Zf_Zm_diag + ((i == seq1Length) ? TERM_GAP_OPEN : GAP_OPEN);\n"
"logOfSum_private(&t, Ze_left + ((i == seq1Length) ? TERM_GAP_EXTEND: GAP_EXTEND));\n"
"Ze[j_offset] = t;\n"
"}\n"
"\n"
"Zm[ij_cur] = Zm_up;\n"
"Ze_Zf_Zm_diag = (j > 0 && j_offset == 0) ? Ze_Zf_Zm_column[i] : Ze_Zf_Zm_prev[j_offset - 1]; // load prev elements\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (legal) {\n"
"// update Ze_Zf_Zm_prev array\n"
"float t;\n"
"t = Ze[j_offset];\n"
"logOfSum_private(&t, Zf_up);\n"
"logOfSum_private(&t, Zm_up);\n"
"Ze_Zf_Zm_prev[j_offset] = t;\n"
"\n"
"// last thread in a group stores column\n"
"if (j_offset == get_local_size(0) - 1) {\n"
"Ze_column[i] = Ze[j_offset];\n"
"Ze_Zf_Zm_column[i] = t;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"ij_cur += JAG_FLOAT; // move to next row\n"
"ij_left += JAG_FLOAT;\n"
"}\n"
"}\n"
"\n"
"//store the sum of zm zf ze (m,n)s in zm's 0,0 th position\n"
"if (j - get_local_size(0) == seq2Length) {\n"
"Zm[0] = Ze_Zf_Zm_prev[j_offset];\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"#undef Zm\n"
"#undef j_offset\n"
"#undef Ze_column\n"
"}\n"
"\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeReverse_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* aux,\n"
"global	const		float* forward,\n"
"global				float* posterior,\n"
"global				float* columns)\n"
"{\n"
"int j_offset = get_local_id(0);\n"
"#define j_offset_rev (get_local_size(0) - get_local_id(0) - 1)\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"++seq1; // + 1 because of '@' character at the beginning of seq.\n"
"++seq2;\n"
"\n"
"#define Ze aux\n"
"float forward0 = forward[0];\n"
"local float* Zm = aux + 1 * get_local_size(0);\n"
"local float* Ze_Zf_Zm_prev = aux + 2 * get_local_size(0);\n"
"\n"
"#define Ze_column columns\n"
"global float* Zm_column = Ze_column + height;\n"
"global float* Ze_Zf_Zm_column = Ze_column + 2 * height;\n"
"\n"
"int j = seq2Length - j_offset_rev;\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j -= get_local_size(0)) {\n"
"\n"
"int j_rev = seq2Length - j;\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"\n"
"// calculate indices\n"
"int ij_cur = jaggedIndex(j, seq1Length + j_offset_rev, height, JAG_FLOAT);\n"
"int ij_diag = jaggedIndex(j + 1, seq1Length + 1 + j_offset_rev, height, JAG_FLOAT);\n"
"int ij_forward_diag = jaggedIndex(j + 1, seq1Length + 1 + j_offset_rev, height, JAG_FLOAT);\n"
"\n"
"float Zf = LOG_ZERO;\n"
"\n"
"if (j >= 0) {\n"
"Zm[j_offset] = (j_rev == 0) ? LOG_ONE : LOG_ZERO; // 0 ... 0 1\n"
"Ze[j_offset] = (j_rev > 0) ? LOG_ONE : LOG_ZERO; // 1 ... 1 0\n"
"}\n"
"\n"
"if (j_rev == 0)  {\n"
"Ze_Zf_Zm_prev[j_offset] = LOG_ONE;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"float Ze_Zf_Zm = LOG_ZERO;\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = seq1Length + j_offset_rev - iter;\n"
"\n"
"float Zm_diag = LOG_ZERO;\n"
"float Ze_diag = LOG_ZERO;\n"
"bool legal = j >= 0 && i >= 0 && i <= seq1Length && j <= seq2Length;\n"
"\n"
"if (legal && j_rev > 0) { // except last column\n"
"if (j_offset_rev == 0) {\n"
"Ze_diag = Ze_column[i];\n"
"Zm_diag = Zm_column[i];\n"
"} else {\n"
"Zm_diag = Zm[j_offset + 1];\n"
"Ze_diag = Ze[j_offset + 1];\n"
"}\n"
"}\n"
"\n"
"if (j_rev == 0 && i == seq1Length - 1)  {  // initialize last column\n"
"Zf = LOG_ONE;\n"
"Zm[j_offset] = LOG_ZERO;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (legal) {\n"
"\n"
"#ifdef EXACT_REVERSE_PARTITION\n"
"if (i == 0 || j == 0) { posterior[ij_cur] = 0; } // initialization of first posterior row and column\n"
"#endif\n"
"\n"
"if (i < seq1Length && j < seq2Length) // for all elements but last row and column\n"
"{\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// terminal gaps may be ignored here as last row and column are not used for posterior calculation\n"
"float t = Zm[j_offset] + GAP_OPEN;\n"
"logOfSum_private(&t, Zf + GAP_EXTEND);\n"
"Zf = t;\n"
"\n"
"t = Zm_diag + GAP_OPEN;\n"
"logOfSum_private(&t, Ze_diag + GAP_EXTEND);\n"
"Ze[j_offset] = t;\n"
"\n"
"// use t to store probability\n"
"t = (forward0 == LOG_ZERO) ? 0 : fast_exp(forward[ij_forward_diag] + Ze_Zf_Zm - forward0);\n"
"Zm[j_offset] = Ze_Zf_Zm + params->subMatrix[id]; // multiply by score\n"
"posterior[ij_diag] = (t <= PROB_HI && t >= PROB_LO) ? t : 0;\n"
"}\n"
"\n"
"Ze_Zf_Zm = (j_offset_rev == 0) ?  (j_rev == 0 ? LOG_ZERO : Ze_Zf_Zm_column[i]) : Ze_Zf_Zm_prev[j_offset + 1];\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (legal) {\n"
"float t;\n"
"if (j_rev > 0 && j >= 0) {\n"
"t = Ze[j_offset];\n"
"logOfSum_private(&t, Zf);\n"
"logOfSum_private(&t, Zm[j_offset]);\n"
"Ze_Zf_Zm_prev[j_offset] = t;\n"
"}\n"
"\n"
"// first thread in a group stores column\n"
"if (j_offset == 0) {\n"
"Ze_column[i] = Ze[0];\n"
"Zm_column[i] = Zm[0];\n"
"Ze_Zf_Zm_column[i] = t;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"ij_cur -= JAG_FLOAT;\n"
"ij_diag -= JAG_FLOAT;\n"
"ij_forward_diag -= JAG_FLOAT;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"#undef j_offset_rev\n"
"#undef Ze_column\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeAll_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				float* auxiliaryRows,\n"
"global				float* auxiliaryLayer,\n"
"global				float* posterior,\n"
"global				float* columns)\n"
"{\n"
"Partition_ComputeForward_long(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer, columns);\n"
"Partition_ComputeReverse_long(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer, posterior, columns);\n"
"}\n"
,
"#define GAP_OPEN		params->gapOpen\n"
"#define GAP_EXTEND		params->gapExt\n"
"\n"
"\n"
"#ifdef USE_FLOAT_PARTITION\n"
"#define jagSize jagSize_float\n"
"#define TERM_GAP_OPEN	1.0f\n"
"#define TERM_GAP_EXTEND 1.0f\n"
"#define PROB_LO 0.001f\n"
"#define PROB_HI 1.0f\n"
"\n"
"typedef float partition_t;\n"
"#else\n"
"#define jagSize jagSize_double\n"
"#define TERM_GAP_OPEN	1.0\n"
"#define TERM_GAP_EXTEND 1.0\n"
"#define PROB_LO 0.001\n"
"#define PROB_HI 1.0\n"
"\n"
"typedef double partition_t;\n"
"#endif\n"
"\n"
"struct PartitionFunctionParams\n"
"{\n"
"partition_t termGapOpen;\n"
"partition_t termGapExtend;\n"
"partition_t gapOpen;\n"
"partition_t gapExt;\n"
"partition_t subMatrix[26 * 26];\n"
"};\n"
"\n"
"__kernel\n"
"void Partition_ComputeForward_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				partition_t* aux,\n"
"global				partition_t* forward,\n"
"global				partition_t* columns)\n"
"{\n"
"\n"
"#define j_offset get_local_id(0)\n"
"\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"#define Zm forward\n"
"#define Ze aux\n"
"local partition_t* Zf = aux + get_local_size(0) * 1;\n"
"local partition_t* Ze_Zf_Zm_prev = aux + get_local_size(0) * 2;\n"
"\n"
"#define Ze_column columns\n"
"global partition_t* Ze_Zf_Zm_column = Ze_column + height;\n"
"\n"
"int j = j_offset;\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"partition_t Zm_up;\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"// column offset in global memory\n"
"int ij_cur = jaggedIndex(j, -j_offset, height, jagSize);		// seek to -j'th row\n"
"int ij_left = jaggedIndex(j - 1, -j_offset, height, jagSize);\n"
"\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"\n"
"// initialization\n"
"if (j <= seq2Length) {\n"
"Zf[j_offset] = 0;\n"
"Ze[j_offset] = (j > 0);\n"
"}\n"
"\n"
"if (j == 0) {\n"
"Ze_Zf_Zm_prev[0] = 1; // next elements are not needed\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"Zm_up = (j == 0);\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = iter - j_offset;	// calculate i coordinate starting from row 0\n"
"\n"
"partition_t Ze_Zf_Zm_diag;\n"
"partition_t Ze_left;\n"
"\n"
"// utilise elements from Ze and Ze_Zf_Zm_prev arrays\n"
"if (j > 0 && j < width && i >= 0 && i <= seq1Length) {\n"
"if (j_offset == 0) {\n"
"Ze_Zf_Zm_diag = Ze_Zf_Zm_column[i];\n"
"Ze_left = Ze_column[i];\n"
"} else {\n"
"Ze_Zf_Zm_diag = Ze_Zf_Zm_prev[j_offset - 1];\n"
"Ze_left = Ze[j_offset - 1] ;\n"
"}\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// update Ze_Zf_Zm_prev array\n"
"if (j < width) {\n"
"Ze_Zf_Zm_prev[j_offset] = Ze[j_offset] + Zf[j_offset] + Zm_up;\n"
"}\n"
"\n"
"// initializations for first column\n"
"if (j == 0) {\n"
"if (i == 1) {\n"
"Zm_up = 0;\n"
"Zf[0] = 1;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) // if indices are legal\n"
"{\n"
"// initialization of first row\n"
"if (i > 0 && j > 0) {\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// update Zf\n"
"Zf[j_offset] = Zm_up * ((j == seq2Length) ? TERM_GAP_OPEN : GAP_OPEN) +\n"
"Zf[j_offset] * ((j == seq2Length) ? TERM_GAP_EXTEND : GAP_EXTEND);\n"
"\n"
"Zm_up = Ze_Zf_Zm_diag * params->subMatrix[id]; // now Zm_cur, future Zm_up\n"
"Ze_Zf_Zm_diag = Zm[ij_left]; // use variable to temporarily store Zm_left\n"
"\n"
"// update Ze\n"
"Ze[j_offset] = Ze_Zf_Zm_diag * ((i == seq1Length) ? TERM_GAP_OPEN : GAP_OPEN) +\n"
"Ze_left * ((i == seq1Length) ? TERM_GAP_EXTEND: GAP_EXTEND);\n"
"}\n"
"\n"
"Zm[ij_cur] = Zm_up;\n"
"\n"
"// last thread in a group stores column\n"
"if (j_offset == get_local_size(0) - 1) {\n"
"Ze_column[i] = Ze[j_offset];\n"
"Ze_Zf_Zm_column[i] = Ze_Zf_Zm_prev[j_offset];\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"ij_cur += jagSize; // move to next row\n"
"ij_left += jagSize;\n"
"}\n"
"}\n"
"\n"
"//store the sum of zm zf ze (m,n)s in zm's 0,0 th position\n"
"if (j - get_local_size(0) == seq2Length) {\n"
"Zm[0] = Ze[j_offset] + Zf[j_offset] + Zm_up;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"#undef Zm\n"
"#undef j_offset\n"
"#undef Ze_column\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeReverse_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				partition_t* aux,\n"
"global	const		partition_t* forward,\n"
"global				float* posterior,\n"
"global				partition_t* columns)\n"
"{\n"
"int j_offset = get_local_id(0);\n"
"#define j_offset_rev (get_local_size(0) - get_local_id(0) - 1)\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"\n"
"++seq1; // + 1 because of '@' character at the beginning of seq.\n"
"++seq2;\n"
"\n"
"#define Ze aux\n"
"partition_t forward0 = forward[0];\n"
"local partition_t* Zf = aux + 1 * get_local_size(0);\n"
"local partition_t* Zm = aux + 2 * get_local_size(0);\n"
"local partition_t* Ze_Zf_Zm_prev = aux + 3 * get_local_size(0);\n"
"\n"
"#define Ze_column columns\n"
"global partition_t* Zm_column = Ze_column + height;\n"
"global partition_t* Ze_Zf_Zm_column = Ze_column + 2 * height;\n"
"\n"
"int j = seq2Length - j_offset_rev;\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j -= get_local_size(0)) {\n"
"\n"
"int j_rev = seq2Length - j;\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"\n"
"// calculate indices\n"
"int ij_cur = jaggedIndex_float(j, seq1Length + j_offset_rev, height);\n"
"int ij_diag = jaggedIndex_float(j + 1, seq1Length + 1 + j_offset_rev, height);\n"
"int ij_forward_diag = jaggedIndex(j + 1, seq1Length + 1 + j_offset_rev, height, jagSize);\n"
"\n"
"if (j >= 0) {\n"
"Zm[j_offset] = (j_rev == 0); // 0 ... 0 1\n"
"Ze[j_offset] = (j_rev > 0); // 1 ... 1 0\n"
"Zf[j_offset] = 0;\n"
"}\n"
"\n"
"if (j_rev == 0)  {\n"
"Ze_Zf_Zm_prev[j_offset] = 1;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = seq1Length + j_offset_rev - iter;\n"
"\n"
"partition_t Ze_Zf_Zm = 0;\n"
"partition_t Zm_diag = 0;\n"
"partition_t Ze_diag = 0;\n"
"\n"
"if (j_rev > 0 && i >= 0 && i <= seq1Length) { // except last column\n"
"if (j_offset_rev == 0) {\n"
"Ze_Zf_Zm = Ze_Zf_Zm_column[i];\n"
"Ze_diag = Ze_column[i];\n"
"Zm_diag = Zm_column[i];\n"
"} else {\n"
"Ze_Zf_Zm = Ze_Zf_Zm_prev[j_offset + 1];\n"
"Zm_diag = Zm[j_offset + 1];\n"
"Ze_diag = Ze[j_offset + 1];\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (j_rev > 0 && j >= 0) {\n"
"Ze_Zf_Zm_prev[j_offset] = Ze[j_offset] + Zf[j_offset] + Zm[j_offset];\n"
"}\n"
"else if (j_rev == 0)  {  // initialize last column\n"
"if (i == seq1Length - 1) {\n"
"Zf[j_offset] = 1;\n"
"Zm[j_offset] = 0;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (j >= 0 && i >= 0 && i <= seq1Length && j <= seq2Length) {\n"
"\n"
"#ifdef EXACT_REVERSE_PARTITION\n"
"if (i == 0 || j == 0) { posterior[ij_cur] = 0; } // initialization of first posterior row and column\n"
"#endif\n"
"\n"
"if (i < seq1Length && j < seq2Length) // for all elements but last row and column\n"
"{\n"
"int id = fast_mad(seq1[i], 26, seq2[j]);\n"
"\n"
"// terminal gaps may be ignored here as last row and column are not used for posterior calculation\n"
"Zf[j_offset] = Zm[j_offset] * GAP_OPEN + Zf[j_offset] * GAP_EXTEND;\n"
"Ze[j_offset] = Zm_diag * GAP_OPEN + Ze_diag * GAP_EXTEND;\n"
"\n"
"// use Zm_diag to store probability\n"
"#define probability Zm_diag\n"
"probability = (forward0 == 0) ? 0 : forward[ij_forward_diag] * Ze_Zf_Zm / forward0;\n"
"\n"
"Zm[j_offset] = Ze_Zf_Zm * params->subMatrix[id]; // multiply by score\n"
"\n"
"posterior[ij_diag] = (probability <= PROB_HI && probability >= PROB_LO) ? probability : 0;\n"
"#undef probability\n"
"}\n"
"\n"
"// first thread in a group stores column\n"
"if (j_offset == 0) {\n"
"Ze_column[i] = Ze[0];\n"
"Zm_column[i] = Zm[0];\n"
"Ze_Zf_Zm_column[i] = Ze_Zf_Zm_prev[0];\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"ij_cur -= jagSize_float;\n"
"ij_diag -= jagSize_float;\n"
"ij_forward_diag -= jagSize;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"#undef height\n"
"#undef width\n"
"#undef j_offset_rev\n"
"#undef Ze_column\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Partition_ComputeAll_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"PART_MEM	const	struct PartitionFunctionParams* params MAX_SIZE(sizeof(struct PartitionFunctionParams)),\n"
"local				partition_t* auxiliaryRows,\n"
"global				partition_t* auxiliaryLayer,\n"
"global				float* posterior,\n"
"global				partition_t* columns)\n"
"{\n"
"Partition_ComputeForward_long(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer, columns);\n"
"Partition_ComputeReverse_long(seq1, seq2, seq1Length, seq2Length, scheduling, params, auxiliaryRows, auxiliaryLayer, posterior, columns);\n"
"}\n"
,
"struct ProbabilisticParams\n"
"{\n"
"float initial[NumMatrixTypes];						// holds the initial probabilities for each state\n"
"float trans[NumMatrixTypes * NumMatrixTypes];        // holds all state-to-state transition probabilities\n"
"float match[SymbolCountSmall * SymbolCountSmall];\n"
"float insert[SymbolCountSmall * NumMatrixTypes];\n"
"};\n"
"\n"
"#define PROBS_DIAG(j) probs->trans[fast_mad(j, depth, j)]\n"
"\n"
"__kernel\n"
"void Probabilistic_ComputeForwardMatrix(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"private				int seq1Length,\n"
"private				int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"HMM_MEM	const	struct ProbabilisticParams* probs MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"local				float* levels,\n"
"global				float* forward)\n"
"{\n"
"int height = seq1Length + 1;\n"
"int width = seq2Length + 1;\n"
"int j = get_local_id(0);\n"
"#define layerOffset ((int)get_local_size(0))\n"
"\n"
"// calculate indices\n"
"int ij_cur = jaggedIndex_float(j, -j, height);		// 0 - current\n"
"int ij_left = jaggedIndex_float(j - 1, -j, height);	// 2 - left\n"
"\n"
"private float levels_up[NumInsertStates] = {LOG_ZERO, LOG_ZERO};\n"
"float fwd_up = LOG_ZERO;\n"
"float levels_diag = LOG_ZERO;\n"
"int c2 = j <= seq2Length ? seq2[j] : -1;\n"
"\n"
"for (int iter = 0, iterationsCount = height + width - 1; iter < iterationsCount; ++iter) {\n"
"\n"
"private float levels_left[NumInsertStates];\n"
"float fwd_cur;\n"
"float fwd_left;\n"
"\n"
"int c1;\n"
"int i = iter - j;								// calculate i coordinate\n"
"bool legal = i >= 0 && i < height && j < width;  // if i and j are legal\n"
"\n"
"if (legal) {\n"
"c1 = seq1[i];\n"
"int ixj = i * j;\n"
"\n"
"fwd_cur = (ixj == 0) ?\n"
"LOG_ZERO :\n"
"(ixj == 1 ? probs->initial[0] : levels_diag) + probs->match[fast_mad(c1, SymbolCountSmall, c2)];\n"
"fwd_left = (j > 0) ? forward[ij_left] : LOG_ZERO;\n"
"forward[ij_cur] = fwd_cur;\n"
"\n"
"// store elements in left cells, init elements in up_cells from 0th column\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"levels_up[k] = (i == 1 && j == 0) ?\n"
"probs->initial[fast_mad(2,k,1)] + probs->insert[fast_mad(c1, depth, k)] :\n"
"levels_up[k];\n"
"\n"
"levels_left[k] = (i == 0 && j == 1) ?\n"
"probs->initial[fast_mad(2,k,2)] + probs->insert[fast_mad(c2, depth, k)] :\n"
"((j > 0) ? levels[fast_mad(layerOffset, k, j - 1)] : LOG_ZERO);\n"
"}\n"
"\n"
"levels_diag = levels[fast_mad(layerOffset, 2, j - 1)];\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); // synchronize after utilising levels elements\n"
"\n"
"bool legalEx = legal && (i > 1 || j > 1); // alter 'legal 'flag\n"
"if (legalEx) {\n"
"int q = 1;\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"levels_up[k] = (i > 0) ?\n"
"probs->insert[fast_mad(c1, depth, k)] + logOfSum(fwd_up + probs->trans[q], levels_up[k] + PROBS_DIAG(q)) :\n"
"levels_up[k];\n"
"++q;\n"
"levels_left[k] = (j > 0) ?\n"
"probs->insert[fast_mad(c2, depth, k)] + logOfSum(fwd_left + probs->trans[q], levels_left[k] + PROBS_DIAG(q)) :\n"
"levels_left[k];\n"
"++q;\n"
"}\n"
"}\n"
"\n"
"// update all local memory rows\n"
"if (legal) {\n"
"fwd_left = fwd_cur + probs->trans[0]; // use fwd_left as auxiliary\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"logOfSum_private(&fwd_left, levels_up[k] + probs->trans[fast_mul(fast_mad(2,k,1), depth)]); // current cell in up layers\n"
"logOfSum_private(&fwd_left, levels_left[k] + probs->trans[fast_mul(fast_mad(2,k,2), depth)]); // current cell in left layers\n"
"levels[fast_mad(layerOffset, k, j)] = levels_left[k];\n"
"}\n"
"levels[fast_mad(layerOffset, 2, j)] = fwd_left;\n"
"}\n"
"\n"
"fwd_up = fwd_cur;\n"
"\n"
"// compute total probability from right-bottom element of forward matrix\n"
"// (corresponding fragment of backward matrix is initialized with known values\n"
"// so we can use them directly instead of backward matrix itself)\n"
"if (i == seq1Length && j == seq2Length) {\n"
"fwd_left = fwd_cur + probs->initial[0]; // use fwd_left for storing total probability\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"logOfSum_private(&fwd_left, levels_up[k] + probs->initial[fast_mad(2,k,1)]); // up\n"
"logOfSum_private(&fwd_left, levels_left[k] + probs->initial[fast_mad(2,k,2)]); // components from other layers\n"
"}\n"
"\n"
"levels[0] = fwd_left;\n"
"\n"
"#ifdef RETURN_GLOBAL\n"
"forward[0] = fwd_left;\n"
"#endif\n"
"}\n"
"\n"
"ij_cur += jagSize_float; // move to next row\n"
"ij_left += jagSize_float; // move to next row\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); // synchronize after every element\n"
"}	// end vertical loop\n"
"\n"
"#undef levelWidth\n"
"#undef layerOffset\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Probabilistic_ComputeBackwardMatrix(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"HMM_MEM	const	struct ProbabilisticParams* probs MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"local				float* levels,\n"
"global				float* backward)\n"
"{\n"
"int height = seq1Length + 1;\n"
"int width = seq2Length + 1;\n"
"int j = get_local_id(0);\n"
"int j_rev = seq2Length - j;\n"
"#define layerOffset ((int)get_local_size(0))\n"
"\n"
"// remember offset for each index combination\n"
"int ij_cur = jaggedIndex_float(j, seq1Length + j_rev, height);						// 0 - current\n"
"int ij_diag = jaggedIndex_float(j + 1, seq1Length + j_rev, height) + jagSize_float;	// 3 - diag\n"
"\n"
"// initialization condition\n"
"private float levels_down[NumInsertStates];\n"
"private float levels_right[NumInsertStates];\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"levels_down[k] = j_rev == 0 ? probs->initial[fast_mad(2,k,1)] : LOG_ZERO;\n"
"levels_right[k] = j_rev == 0 ? probs->initial[fast_mad(2,k,2)] : LOG_ZERO; // this will be used for current and right\n"
"}\n"
"\n"
"float total = 0;\n"
"int c2 = j <= seq2Length ? seq2[j + 1] : -1;\n"
"\n"
"for (int iter = 0, iterationsCount = height + width - 1; iter < iterationsCount; ++iter) {\n"
"int i = seq1Length + j_rev - iter;\n"
"bool legal = i >= 0 && i <= seq1Length && j < width;\n"
"\n"
"if (legal) {\n"
"int c1 = seq1[i + 1]; // + 1 as in original code (assume seq1 buffer to be longer)\n"
"\n"
"float bwd_cur = (i == seq1Length && j == seq2Length) ? probs->initial[0] : LOG_ZERO;\n"
"float bwd_diag_ex = LOG_ZERO;\n"
"\n"
"if (i < seq1Length && j < seq2Length) {\n"
"bwd_diag_ex = backward[ij_diag] + probs->match[fast_mad(c1, SymbolCountSmall, c2)];\n"
"bwd_cur = bwd_diag_ex + probs->trans[0];\n"
"}\n"
"\n"
"// calculate total probability from left-top fragment of backward matrix\n"
"// (corresponding fragment of forward matrix is initialized with known values\n"
"// so we can use them directly instead of forward matrix itself)\n"
"if (i == 0 && j == 0) {\n"
"total = probs->initial[0] + bwd_diag_ex;\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"logOfSum_private(&total, probs->initial[fast_mad(2,k,1)] + probs->insert[fast_mad(c1, depth, k)] + levels_down[k]);\n"
"logOfSum_private(&total, probs->initial[fast_mad(2,k,2)] + probs->insert[fast_mad(c2, depth, k)] + levels[fast_mad(layerOffset, k, j + 1)]);\n"
"}\n"
"}\n"
"\n"
"if (i < seq1Length) {\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"int q = fast_mad(2,k,1);\n"
"levels_right[k] = LOG_ZERO;\n"
"\n"
"float aux = levels_down[k] + probs->insert[fast_mad(c1, depth, k)];\n"
"levels_down[k] = (j < seq2Length) ? bwd_diag_ex + probs->trans[fast_mul(q, depth)] : LOG_ZERO;\n"
"logOfSum_private(&bwd_cur, aux + probs->trans[q]);\n"
"logOfSum_private(&levels_down[k], aux + probs->trans[fast_mad(q, depth, q)]);\n"
"}\n"
"}\n"
"\n"
"if (j < seq2Length) {\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"int q = fast_mad(2,k,2);\n"
"float aux = levels[fast_mad(layerOffset, k, j + 1)] + probs->insert[fast_mad(c2, depth, k)];\n"
"levels_right[k] = (i < seq1Length) ? bwd_diag_ex + probs->trans[fast_mul(q, depth)] : LOG_ZERO;\n"
"logOfSum_private(&bwd_cur, aux + probs->trans[q]);\n"
"logOfSum_private(&levels_right[k], aux + probs->trans[fast_mad(q, depth, q)]);\n"
"}\n"
"}\n"
"\n"
"backward[ij_cur] = bwd_cur;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (legal) {\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"levels[fast_mad(layerOffset, k, j)] = levels_right[k]; 		// store private cells\n"
"}\n"
"}\n"
"\n"
"ij_cur -= jagSize_float;\n"
"ij_diag -= jagSize_float;\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"} // end vertical loop\n"
"\n"
"if (j == 0) {\n"
"levels[0] = total;\n"
"#ifdef RETURN_GLOBAL\n"
"backward[jaggedIndex(seq2Length, seq1Length, height, jagSize_float)] = total;\n"
"#endif\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"#undef levelWidth\n"
"}\n"
"\n"
"__kernel\n"
"void Probabilistic_ComputePosteriorMatrix(\n"
"int seq1Length,\n"
"int seq2Length,\n"
"float totalProb,\n"
"struct PosteriorSchedule scheduling,\n"
"global	const	float* forward,\n"
"global	const	float* backward,\n"
"global			float* posterior)\n"
"{\n"
"int j = get_local_id(0);\n"
"int ij = jaggedIndex_float(j, -j, seq1Length + 1);\n"
"\n"
"totalProb = (totalProb == 0) ? 1.00f : totalProb;\n"
"\n"
"for (int iter = 0, iterCount = seq1Length + seq2Length + 1; iter < iterCount; ++iter) {\n"
"int i = iter - j;\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) {\n"
"posterior[ij] = fast_exp(min(LOG_ONE, forward[ij] + backward[ij] - totalProb));\n"
"}\n"
"ij += jagSize_float;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Probabilistic_ComputeAll(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"HMM_MEM	const	struct ProbabilisticParams* probabilistic MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"local				float* auxiliaryRows,\n"
"global				float* auxiliaryTwoLayers,\n"
"global				float* posterior)\n"
"{\n"
"#define j get_local_id(0)\n"
"int layerSize = jaggedSize(seq2Length + 1, seq1Length + 1, jagSize_float);\n"
"\n"
"// place forward matrix at layer 0, backward matrix at layer 1, treat posterior as auxiliary buffer\n"
"#define forward	(auxiliaryTwoLayers)\n"
"#define backward (auxiliaryTwoLayers + layerSize)\n"
"\n"
"Probabilistic_ComputeForwardMatrix(seq1, seq2, seq1Length, seq2Length, scheduling, probabilistic, auxiliaryRows, forward);\n"
"float totalProb = auxiliaryRows[0]; // get total probability from forward algorithm\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"Probabilistic_ComputeBackwardMatrix(seq1, seq2, seq1Length, seq2Length, scheduling, probabilistic, auxiliaryRows, backward);\n"
"totalProb = (totalProb + auxiliaryRows[0]) / 2; // get total probability from backward algorithm and combine it with forward-one\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"#ifdef RETURN_GLOBAL\n"
"// restore original values\n"
"if (j == 0) {\n"
"int lastIndex = jaggedIndex_float(seq2Length, seq1Length, seq1Length + 1);\n"
"forward[0] = LOG_ZERO;\n"
"backward[lastIndex] = probabilistic->initial[0];\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"#endif\n"
"\n"
"// compute posterior probability\n"
"Probabilistic_ComputePosteriorMatrix(seq1Length, seq2Length, totalProb, scheduling, forward, backward, posterior);\n"
"\n"
"// This is just to return total probability. Remember to set it to 0 in host!\n"
"#ifdef RETURN_GLOBAL\n"
"if (j == 0) {\n"
"posterior[0] = totalProb;\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"#endif\n"
"\n"
"#undef j\n"
"#undef forward\n"
"#undef backward\n"
"}\n"
,
"struct ProbabilisticParams\n"
"{\n"
"float initial[NumMatrixTypes];						// holds the initial probabilities for each state\n"
"float trans[NumMatrixTypes * NumMatrixTypes];        // holds all state-to-state transition probabilities\n"
"float match[SymbolCountSmall * SymbolCountSmall];\n"
"float insert[SymbolCountSmall * NumMatrixTypes];\n"
"};\n"
"\n"
"#define PROBS_DIAG(j) probs->trans[fast_mad(j, depth, j)]\n"
"\n"
"__kernel\n"
"void Probabilistic_ComputeForwardMatrix_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"HMM_MEM	const	struct ProbabilisticParams* probs MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"local				float* levels,\n"
"global				float* forward,\n"
"global				float* columns)\n"
"{\n"
"int height = seq1Length + 1;\n"
"int width = seq2Length + 1;\n"
"int j_offset = get_local_id(0);\n"
"int j = j_offset;\n"
"#define layerOffset ((int)get_local_size(0))\n"
"\n"
"for (int q = j_offset; q < depth * height; q += get_local_size(0)) {\n"
"columns[q] = LOG_ZERO;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"private float levels_up[NumInsertStates];\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"float fwd_up;\n"
"float levels_diag;\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"// calculate indices\n"
"int c2 = j <= seq2Length ? seq2[j] : -1;\n"
"int ij_cur = jaggedIndex_float(j, -j_offset, height);		// 0 - current\n"
"int ij_left = jaggedIndex_float(j - 1, -j_offset, height);	// 2 - left\n"
"\n"
"fwd_up = LOG_ZERO;\n"
"levels_diag = LOG_ZERO;\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"levels_up[k] = LOG_ZERO;\n"
"}\n"
"\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter)\n"
"{\n"
"private float levels_left[NumInsertStates];\n"
"float fwd_cur;\n"
"float fwd_left;\n"
"\n"
"int c1;\n"
"int i = iter - j_offset;						// calculate i coordinate\n"
"bool legal = i >= 0 && i < height && j < width;  // if i and j are legal\n"
"\n"
"if (legal) {\n"
"c1 = seq1[i];\n"
"int ixj = i * j;\n"
"\n"
"fwd_cur = (ixj == 0) ?\n"
"LOG_ZERO :\n"
"(ixj == 1 ? probs->initial[0] : levels_diag) + probs->match[fast_mad(c1, SymbolCountSmall, c2)];\n"
"fwd_left = (j > 0) ? forward[ij_left] : LOG_ZERO;\n"
"forward[ij_cur] = fwd_cur;\n"
"\n"
"// store elements in left cells, init elements in up_cells from 0th column\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"levels_up[k] = (i == 1 && j == 0) ?\n"
"probs->initial[fast_mad(2,k,1)] + probs->insert[fast_mad(c1, depth, k)] :\n"
"levels_up[k];\n"
"\n"
"levels_left[k] = (i == 0 && j == 1) ?\n"
"probs->initial[fast_mad(2,k,2)] + probs->insert[fast_mad(c2, depth, k)] :\n"
"((j > 0) ?\n"
"(j_offset == 0) ?\n"
"columns[k * height + i] :\n"
"levels[fast_mad(layerOffset, k, j_offset - 1)] : LOG_ZERO);\n"
"}\n"
"\n"
"levels_diag = (j_offset == 0) ? columns[2 * height + i] : levels[fast_mad(layerOffset, 2, j_offset - 1)];\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); // synchronize after utilising levels elements\n"
"\n"
"bool legalEx = legal && (i > 1 || j > 1); // alter 'legal 'flag\n"
"if (legalEx) {\n"
"int q = 1;\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"levels_up[k] = (i > 0) ?\n"
"probs->insert[fast_mad(c1, depth, k)] + logOfSum(fwd_up + probs->trans[q], levels_up[k] + PROBS_DIAG(q)) :\n"
"levels_up[k];\n"
"++q;\n"
"levels_left[k] = (j > 0) ?\n"
"probs->insert[fast_mad(c2, depth, k)] + logOfSum(fwd_left + probs->trans[q], levels_left[k] + PROBS_DIAG(q)) :\n"
"levels_left[k];\n"
"++q;\n"
"}\n"
"}\n"
"\n"
"// update all local memory rows\n"
"if (legal) {\n"
"fwd_left = fwd_cur + probs->trans[0]; // use fwd_left as auxiliary\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"logOfSum_private(&fwd_left, levels_up[k] + probs->trans[fast_mul(fast_mad(2,k,1), depth)]); // current cell in up layers\n"
"logOfSum_private(&fwd_left, levels_left[k] + probs->trans[fast_mul(fast_mad(2,k,2), depth)]); // current cell in left layers\n"
"levels[fast_mad(layerOffset, k, j_offset)] = levels_left[k];\n"
"}\n"
"levels[fast_mad(layerOffset, 2, j_offset)] = fwd_left;\n"
"}\n"
"\n"
"fwd_up = fwd_cur;\n"
"\n"
"// compute total probability from right-bottom element of forward matrix\n"
"// (corresponding fragment of backward matrix is initialized with known values\n"
"// so we can use them directly instead of backward matrix itself)\n"
"if (i == seq1Length && j == seq2Length) {\n"
"fwd_left = fwd_cur + probs->initial[0]; // use fwd_left for storing total probability\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"logOfSum_private(&fwd_left, levels_up[k] + probs->initial[fast_mad(2,k,1)]); // up\n"
"logOfSum_private(&fwd_left, levels_left[k] + probs->initial[fast_mad(2,k,2)]); // components from other layers\n"
"}\n"
"\n"
"levels[0] = fwd_left;\n"
"\n"
"#ifdef RETURN_GLOBAL\n"
"forward[0] = fwd_left;\n"
"#endif\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"if (i >= 0 && i < height && (j_offset == get_local_size(0) - 1)) {\n"
"for (int k = 0; k < NumInsertStates + 1; ++k) { // as there are three layers\n"
"columns[k * height + i] = levels[fast_mad(layerOffset, k, j_offset)];\n"
"}\n"
"}\n"
"\n"
"ij_cur += jagSize_float; // move to next row\n"
"ij_left += jagSize_float; // move to next row\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); // synchronize after every element\n"
"\n"
"}	// end vertical loop\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"#undef layerOffset\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Probabilistic_ComputeBackwardMatrix_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"HMM_MEM	const	struct ProbabilisticParams* probs MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"local				float* levels,\n"
"global				float* backward,\n"
"global				float* columns)\n"
"{\n"
"int height = seq1Length + 1;\n"
"int width = seq2Length + 1;\n"
"int j_offset = get_local_id(0);\n"
"#define j_offset_rev (get_local_size(0) - get_local_id(0) - 1)\n"
"int j = seq2Length - j_offset_rev;\n"
"#define layerOffset ((int)get_local_size(0))\n"
"\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"float total = 0;\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j -= get_local_size(0)) {\n"
"int j_rev = seq2Length - j;\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"\n"
"// remember offset for each index combination\n"
"int ij_cur = jaggedIndex_float(j, seq1Length + j_offset_rev, height);\n"
"int ij_diag = jaggedIndex_float(j + 1, seq1Length + j_offset_rev, height) + jagSize_float;\n"
"\n"
"// initialization condition\n"
"\n"
"private float levels_down[NumInsertStates];\n"
"private float levels_right[NumInsertStates];\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"levels_down[k] = j_rev == 0 ? probs->initial[fast_mad(2,k,1)] : LOG_ZERO;\n"
"levels_right[k] = j_rev == 0 ? probs->initial[fast_mad(2,k,2)] : LOG_ZERO; // this will be used for current and right\n"
"}\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"\n"
"int i = seq1Length + j_offset_rev - iter;\n"
"bool legal = i >= 0 && i <= seq1Length && j >= 0 && j < width;\n"
"if (legal) {\n"
"int c1 = seq1[i + 1]; // + 1 as in original code (assume seq1 buffer to be longer)\n"
"int c2 = seq2[j + 1];\n"
"\n"
"float bwd_cur = (i == seq1Length && j == seq2Length) ? probs->initial[0] : LOG_ZERO;\n"
"float bwd_diag_ex = LOG_ZERO;\n"
"\n"
"if (i < seq1Length && j < seq2Length) {\n"
"bwd_diag_ex = backward[ij_diag] + probs->match[fast_mad(c1, SymbolCountSmall, c2)];\n"
"bwd_cur = bwd_diag_ex + probs->trans[0];\n"
"}\n"
"\n"
"// calculate total probability from left-top fragment of backward matrix\n"
"// (corresponding fragment of forward matrix is initialized with known values\n"
"// so we can use them directly instead of forward matrix itself)\n"
"if (i == 0 && j == 0) {\n"
"total = probs->initial[0] + bwd_diag_ex;\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"logOfSum_private(&total, probs->initial[fast_mad(2,k,1)] + probs->insert[fast_mad(c1, depth, k)] + levels_down[k]);\n"
"logOfSum_private(&total, probs->initial[fast_mad(2,k,2)] + probs->insert[fast_mad(c2, depth, k)] +\n"
"((j_offset_rev == 0) ? columns[k * height + i] : levels[fast_mad(layerOffset, k, j_offset + 1)]));\n"
"}\n"
"}\n"
"\n"
"if (i < seq1Length) {\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"int q = fast_mad(2,k,1);\n"
"levels_right[k] = LOG_ZERO;\n"
"\n"
"float aux = levels_down[k] + probs->insert[fast_mad(c1, depth, k)];\n"
"levels_down[k] = (j < seq2Length) ? bwd_diag_ex + probs->trans[fast_mul(q, depth)] : LOG_ZERO;\n"
"logOfSum_private(&bwd_cur, aux + probs->trans[q]);\n"
"logOfSum_private(&levels_down[k], aux + probs->trans[fast_mad(q, depth, q)]);\n"
"}\n"
"}\n"
"\n"
"if (j < seq2Length) {\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"int q = fast_mad(2,k,2);\n"
"float aux =\n"
"((j_offset_rev == 0) ? columns[k * height + i] : levels[fast_mad(layerOffset, k, j_offset + 1)])\n"
"+ probs->insert[fast_mad(c2, depth, k)];\n"
"\n"
"levels_right[k] = (i < seq1Length) ? bwd_diag_ex + probs->trans[fast_mul(q, depth)] : LOG_ZERO;\n"
"logOfSum_private(&bwd_cur, aux + probs->trans[q]);\n"
"logOfSum_private(&levels_right[k], aux + probs->trans[fast_mad(q, depth, q)]);\n"
"}\n"
"}\n"
"\n"
"backward[ij_cur] = bwd_cur;\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"// first thread in a group stores column\n"
"if (legal) {\n"
"for (int k = 0; k < NumInsertStates; ++k) {\n"
"if (j_offset == 0 ) {\n"
"columns[k * height + i] = levels_right[k];\n"
"} else {\n"
"levels[fast_mad(layerOffset, k, j_offset)] = levels_right[k]; 		// store private cells\n"
"}\n"
"}\n"
"}\n"
"\n"
"ij_cur -= jagSize_float;\n"
"ij_diag -= jagSize_float;\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"} // end vertical loop\n"
"}\n"
"\n"
"if (total != 0) {\n"
"levels[0] = total;\n"
"#ifdef RETURN_GLOBAL\n"
"backward[jaggedIndex(seq2Length, seq1Length, height, jagSize_float)] = total;\n"
"#endif\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"#undef levelWidth\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Probabilistic_ComputePosteriorMatrix_long(\n"
"int seq1Length,\n"
"int seq2Length,\n"
"float totalProb,\n"
"struct PosteriorSchedule scheduling,\n"
"global	const	float* forward,\n"
"global	const	float* backward,\n"
"global			float* posterior)\n"
"{\n"
"int j_offset = get_local_id(0);\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"totalProb = (totalProb == 0) ? 1.00f : totalProb;\n"
"\n"
"int j = j_offset;\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"int ij = jaggedIndex_float(j, -j_offset, seq1Length + 1);\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = iter - j_offset;\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) {\n"
"posterior[ij] = fast_exp(min(LOG_ONE, forward[ij] + backward[ij] - totalProb));\n"
"}\n"
"ij += jagSize_float;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"}\n"
"\n"
"\n"
"__kernel\n"
"void Probabilistic_ComputeAll_long(\n"
"global		const	char* seq1,\n"
"global		const	char* seq2,\n"
"int seq1Length,\n"
"int seq2Length,\n"
"struct PosteriorSchedule scheduling,\n"
"HMM_MEM	const	struct ProbabilisticParams* probabilistic MAX_SIZE(sizeof(struct ProbabilisticParams)),\n"
"local				float* auxiliaryRows,\n"
"global				float* auxiliaryTwoLayers,\n"
"global				float* posterior,\n"
"global				float* columns)\n"
"{\n"
"#define j get_local_id(0)\n"
"int layerSize = jaggedSize(seq2Length + 1, seq1Length + 1, jagSize_float);\n"
"\n"
"// place forward matrix at layer 0, backward matrix at layer 1, treat posterior as auxiliary buffer\n"
"#define forward	(auxiliaryTwoLayers)\n"
"#define backward (auxiliaryTwoLayers + layerSize)\n"
"\n"
"Probabilistic_ComputeForwardMatrix_long(\n"
"seq1, seq2, seq1Length, seq2Length, scheduling, probabilistic, auxiliaryRows, forward, columns);\n"
"float totalProb = auxiliaryRows[0]; // get total probability from forward algorithm\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"Probabilistic_ComputeBackwardMatrix_long(\n"
"seq1, seq2, seq1Length, seq2Length, scheduling, probabilistic, auxiliaryRows, backward, columns);\n"
"totalProb = (totalProb + auxiliaryRows[0]) / 2; // get total probability from backward algorithm and combine it with forward-one\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"#ifdef RETURN_GLOBAL\n"
"// restore original values\n"
"if (j == 0) {\n"
"int lastIndex = jaggedIndex_float(seq2Length, seq1Length, seq1Length + 1);\n"
"forward[0] = LOG_ZERO;\n"
"backward[lastIndex] = probabilistic->initial[0];\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"#endif\n"
"\n"
"// compute posterior probability\n"
"Probabilistic_ComputePosteriorMatrix_long(seq1Length, seq2Length, totalProb, scheduling, forward, backward, posterior);\n"
"\n"
"// This is just to return total probability. Remember to set it to 0 in host!\n"
"#ifdef RETURN_GLOBAL\n"
"if (j == 0) {\n"
"posterior[0] = totalProb;\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"#endif\n"
"\n"
"#undef j\n"
"#undef forward\n"
"#undef backward\n"
"}\n"
,
"#define RND_MAX 65536\n"
"#define RND_MAX_INV 0.000015298473212373405134167610072515f\n"
"\n"
"int parkmiller(int seed) // 1<=seed<m\n"
"{\n"
"int const a = 75;\n"
"int const m = RND_MAX + 1;\n"
"seed = ((seed * a)) % m;\n"
"\n"
"return seed;\n"
"}\n"
,
"#if defined(cl_khr_fp64) //&& __OPENCL_VERSION__ == 10	// Khronos extension available?\n"
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"#elif defined(cl_amd_fp64) //&& __OPENCL_VERSION__ == 10 // AMD extension available?\n"
"#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
"#endif\n"
"\n"
"#define assert(x)	 // empty function\n"
"\n"
"// some constants\n"
"#define NumMatchStates		1\n"
"#define NumInsertStates 	2\n"
"#define NumMatrixTypes		5\n"
"#define SymbolCountSmall	26\n"
"#define depth				NumMatrixTypes\n"
"\n"
"// define special memory qualifiers\n"
"#ifdef COPY_PROBABILISTIC_PARAMS\n"
"#define HMM_MEM local\n"
"#define MAX_SIZE(x)  //__attribute__((max_constant_size (x)))\n"
"#else\n"
"#define HMM_MEM global\n"
"#define MAX_SIZE(x)\n"
"#endif\n"
"\n"
"#ifdef COPY_PARTITION_PARAMS\n"
"#define PART_MEM local\n"
"#define MAX_SIZE(x)  //__attribute__((max_constant_size (x)))\n"
"#else\n"
"#define PART_MEM global\n"
"#define MAX_SIZE(x)\n"
"#endif\n"
"\n"
"\n"
"#define fast_mad(x, y, z) mad24(x, y, z)\n"
"#define fast_mul(x, y) mul24(x, y)\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
,
"\n"
"typedef int index_type;\n"
"\n"
"typedef int column_type;\n"
"typedef float value_type;\n"
"typedef struct cell_type{\n"
"column_type column;\n"
"value_type value;\n"
"};\n"
"\n"
"/*\n"
"typedef int relax_column_type;\n"
"typedef float relax_value_type;\n"
"typedef struct {\n"
"relax_column_type column;\n"
"relax_value_type value;\n"
"} relax_cell_type;\n"
"typedef cell_type relax_cell_type;\n"
"\n"
"*/\n"
"\n"
"// basic accessors\n"
"global float* sparse_numCells(global float* ptr) {\n"
"return ptr + 1;\n"
"}\n"
"\n"
"global index_type* sparse_rowSizes(const global float * const ptr) {\n"
"return (global index_type*)(ptr + 2);\n"
"}\n"
"\n"
"global index_type* sparse_rowIndices(const global float * const ptr, const int height) {\n"
"return (global index_type*)(ptr + 2) + height;\n"
"}\n"
"\n"
"global struct cell_type* sparse_data(const global float * const ptr, const int height) {\n"
"return (global struct cell_type*)((global index_type*)(ptr + 2) + 2 * height);\n"
"}\n"
"\n"
"global struct cell_type* sparse_cell(const global float * const ptr, const int height, const int rowId, const int laneId) {\n"
"return sparse_data(ptr, height) + sparse_rowIndices(ptr, height)[rowId] + laneId;\n"
"}\n"
"\n"
"// fixed point accessors\n"
"global ushort2* sparse_data_fixed(const global float * const ptr, const int height) {\n"
"return (global ushort2*)((global index_type*)(ptr + 2) + 2 * height);\n"
"}\n"
"\n"
"global ushort2* sparse_cell_fixed(const global float * const ptr, const int height, const int rowId, const int laneId) {\n"
"return sparse_data_fixed(ptr, height) + sparse_rowIndices(ptr, height)[rowId] + laneId;\n"
"}\n"
"\n"
"// offset accessors\n"
"global index_type* sparse_rowSizes_offset(const global float * const ptr) {\n"
"return (global index_type*)(ptr);\n"
"}\n"
"\n"
"global index_type* sparse_rowIndices_offset(const global float * const ptr, const int height) {\n"
"return (global index_type*)(ptr) + height;\n"
"}\n"
"\n"
"global struct cell_type* sparse_data_offset(const global float * const ptr, const int height) {\n"
"return (global struct cell_type*)((global index_type*)(ptr) + 2 * height);\n"
"}\n"
"\n"
"global struct cell_type* sparse_cell_offset(const global float * const ptr, const int height, const int rowId, const int laneId) {\n"
"return sparse_data_offset(ptr, height) + sparse_rowIndices_offset(ptr, height)[rowId] + laneId;\n"
"}\n"
"\n"
"__kernel void SparseMatrix_Generate(\n"
"int seq1Length,\n"
"int seq2Length,\n"
"global	const	float* posterior,\n"
"local			int* verticalBuffer,\n"
"global			float* sparse)\n"
"{\n"
"global index_type* out_rowSizes = sparse_rowSizes(sparse);\n"
"global index_type* out_rowIndices = sparse_rowIndices(sparse, seq1Length + 1);\n"
"global struct cell_type* out_data =  sparse_data(sparse, seq1Length + 1);\n"
"\n"
"int numCells = 0;\n"
"int maxRowSize = 0;\n"
"int j = get_local_id(0);\n"
"\n"
"int index = jaggedIndex(j, -j + 1, seq1Length + 1, jagSize_float);			// seek to -j'th row\n"
"global const float *p = posterior + index;\n"
"\n"
"// calculate number of non-zero elements in each row\n"
"// vertical buffer stores size of rows\n"
"int iterationsCount  = seq1Length + seq2Length; // omit 0'th row\n"
"for (int iter = 0; iter < iterationsCount; ++iter)\n"
"{\n"
"int i = iter - j + 1;\n"
"if (i > 0 && i <= seq1Length)\n"
"{\n"
"if (j == 0)	{ verticalBuffer[i] = 0; }\n"
"else if (j <= seq2Length && *p >= POSTERIOR_CUTOFF) { ++verticalBuffer[i];  }\n"
"}\n"
"\n"
"p += jagSize_float;\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"}\n"
"\n"
"// copy verticalBuffer to rowSizes\n"
"for (int q = j; q <= seq1Length; q += get_local_size(0)) {\n"
"out_rowSizes[q] = verticalBuffer[q];\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"// calculate starting indices in sparse matrix\n"
"// vertical buffer stores row pointers\n"
"if (j == 0) {\n"
"verticalBuffer[0] = 0;\n"
"\n"
"for (int i = 1; i <= seq1Length; ++i) {\n"
"int size = verticalBuffer[i];\n"
"maxRowSize = max(maxRowSize, size);\n"
"verticalBuffer[i] = numCells;\n"
"numCells += size;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// copy verticalBuffer to rowIndices\n"
"for (int q = j; q <= seq1Length; q += get_local_size(0)) {\n"
"out_rowIndices[q] = verticalBuffer[q];\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"\n"
"// fill in data matrix\n"
"p = posterior + index;\n"
"for (int iter = 0; iter < iterationsCount; ++iter) {\n"
"int i = iter - j + 1;\n"
"if (i > 0 && i <= seq1Length && j > 0 && j <= seq2Length) {\n"
"float v = *p;\n"
"if (v >= POSTERIOR_CUTOFF) {\n"
"out_data[verticalBuffer[i]].column = j;\n"
"out_data[verticalBuffer[i]].value = v;\n"
"++verticalBuffer[i];\n"
"}\n"
"}\n"
"p += jagSize_float;\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"}\n"
"\n"
"if (j == 0) {\n"
"out_rowIndices[0] = numCells; // store number of non-zero cells here\n"
"out_rowSizes[0] = maxRowSize;	// store maximum row size\n"
"\n"
"sparse[0] = posterior[0];	// store distances\n"
"sparse[1] = numCells;		// store number of non-zero elements\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"__kernel void SparseMatrix_GenerateTranspose(\n"
"int seq1Length,\n"
"int seq2Length,\n"
"global	const	float* posterior,\n"
"local			int* horizontalBuffer,\n"
"global			float* sparse)\n"
"{\n"
"global index_type* out_rowSizes = sparse_rowSizes(sparse);\n"
"global index_type* out_rowIndices = sparse_rowIndices(sparse, seq2Length + 1);\n"
"global struct cell_type* out_data =  sparse_data(sparse, seq2Length + 1);\n"
"\n"
"int numCells = 0;\n"
"int maxRowSize = 0;\n"
"int j = get_local_id(0);\n"
"\n"
"int rowSize = 0;\n"
"int index = jaggedIndex(j, -j + 1, seq1Length + 1, jagSize_float);			// seek to -j'th row\n"
"global const float *p = posterior + index;\n"
"\n"
"// calculate number of non-zero elements in each column\n"
"int iterationsCount  = seq1Length + seq2Length; // omit 0'th column\n"
"for (int iter = 0; iter < iterationsCount; ++iter) {\n"
"int i = iter - j + 1;\n"
"if (i > 0 && i <= seq1Length) {\n"
"if (j > 0 && j <= seq2Length && *p >= POSTERIOR_CUTOFF) { ++rowSize; }\n"
"}\n"
"\n"
"p += jagSize_float;\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"// copy register value to rowSizes and horizontalBuffer\n"
"if (j <= seq2Length) { out_rowSizes[j] = horizontalBuffer[j] = rowSize; }\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// calculate starting indices in sparse matrix\n"
"if (j == 0) {\n"
"for (int x = 1; x <= seq2Length; ++x) {\n"
"int size = horizontalBuffer[x];\n"
"maxRowSize = max(maxRowSize, size);\n"
"horizontalBuffer[x] = numCells;\n"
"numCells += size;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// copy verticalBuffer to rowIndices\n"
"if (j <= seq2Length) { out_rowIndices[j] = horizontalBuffer[j]; }\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"// fill in data matrix\n"
"p = posterior + index;\n"
"for (int iter = 0; iter < iterationsCount; ++iter) {\n"
"int i = iter - j + 1;\n"
"if (i > 0 && i <= seq1Length && j > 0 && j <= seq2Length) {\n"
"float v = *p;\n"
"if (v >= POSTERIOR_CUTOFF) {\n"
"out_data[horizontalBuffer[j]].column = i;\n"
"out_data[horizontalBuffer[j]].value = v;\n"
"++horizontalBuffer[j];\n"
"}\n"
"}\n"
"p += jagSize_float;\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"}\n"
"\n"
"if (j == 0) {\n"
"out_rowIndices[0] = numCells; // store number of non-zero cells here\n"
"out_rowSizes[0] = maxRowSize;	// store maximum row size\n"
"\n"
"sparse[0] = posterior[0];	// store distances\n"
"sparse[1] = numCells;		// store number of non-zero elements\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"__kernel void SparseMatrix_ToDense(\n"
"int seq1Length,\n"
"int seq2Length,\n"
"global	const	struct cell_type* data,\n"
"global	const	int* rowSizes,\n"
"global			float* posterior)\n"
"{\n"
"int j = get_local_id(0);\n"
"int index = jaggedIndex(j, -j, seq1Length + 1, jagSize_float);			// seek to -j'th row\n"
"\n"
"int iterationsCount  = seq1Length + seq2Length + 1;\n"
"for (int iter = 0; iter < iterationsCount; ++iter) {\n"
"int i = iter - j;\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) {\n"
"posterior[index] = 0;\n"
"}\n"
"\n"
"index += jagSize_float;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"global struct cell_type* rowPtr = data;\n"
"for (int i = 1; i <= seq1Length; i++) {\n"
"if (j < rowSizes[i]) {\n"
"int index = jaggedIndex(rowPtr[j].column, i, seq1Length + 1, jagSize_float);\n"
"posterior[index] = rowPtr[j].value;\n"
"}\n"
"\n"
"rowPtr += rowSizes[i];\n"
"}\n"
"}\n"
,
"\n"
"typedef int index_type;\n"
"\n"
"typedef int column_type;\n"
"typedef float value_type;\n"
"typedef struct cell_type{\n"
"column_type column;\n"
"value_type value;\n"
"};\n"
"\n"
"/*\n"
"typedef int relax_column_type;\n"
"typedef float relax_value_type;\n"
"typedef struct {\n"
"relax_column_type column;\n"
"relax_value_type value;\n"
"} relax_cell_type;\n"
"typedef cell_type relax_cell_type;\n"
"\n"
"*/\n"
"\n"
"// basic accessors\n"
"global float* sparse_numCells(global float* ptr) {\n"
"return ptr + 1;\n"
"}\n"
"\n"
"global index_type* sparse_rowSizes(const global float * const ptr) {\n"
"return (global index_type*)(ptr + 2);\n"
"}\n"
"\n"
"global index_type* sparse_rowIndices(const global float * const ptr, const int height) {\n"
"return (global index_type*)(ptr + 2) + height;\n"
"}\n"
"\n"
"global struct cell_type* sparse_data(const global float * const ptr, const int height) {\n"
"return (global struct cell_type*)((global index_type*)(ptr + 2) + 2 * height);\n"
"}\n"
"\n"
"global struct cell_type* sparse_cell(const global float * const ptr, const int height, const int rowId, const int laneId) {\n"
"return sparse_data(ptr, height) + sparse_rowIndices(ptr, height)[rowId] + laneId;\n"
"}\n"
"\n"
"// fixed point accessors\n"
"global ushort2* sparse_data_fixed(const global float * const ptr, const int height) {\n"
"return (global ushort2*)((global index_type*)(ptr + 2) + 2 * height);\n"
"}\n"
"\n"
"global ushort2* sparse_cell_fixed(const global float * const ptr, const int height, const int rowId, const int laneId) {\n"
"return sparse_data_fixed(ptr, height) + sparse_rowIndices(ptr, height)[rowId] + laneId;\n"
"}\n"
"\n"
"// offset accessors\n"
"global index_type* sparse_rowSizes_offset(const global float * const ptr) {\n"
"return (global index_type*)(ptr);\n"
"}\n"
"\n"
"global index_type* sparse_rowIndices_offset(const global float * const ptr, const int height) {\n"
"return (global index_type*)(ptr) + height;\n"
"}\n"
"\n"
"global struct cell_type* sparse_data_offset(const global float * const ptr, const int height) {\n"
"return (global struct cell_type*)((global index_type*)(ptr) + 2 * height);\n"
"}\n"
"\n"
"global struct cell_type* sparse_cell_offset(const global float * const ptr, const int height, const int rowId, const int laneId) {\n"
"return sparse_data_offset(ptr, height) + sparse_rowIndices_offset(ptr, height)[rowId] + laneId;\n"
"}\n"
"\n"
"\n"
"__kernel void SparseMatrix_Generate_long(\n"
"int seq1Length,\n"
"int seq2Length,\n"
"global	const	float* posterior,\n"
"local			int* verticalBuffer,\n"
"global			float* sparse)\n"
"{\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"int j_offset = get_local_id(0);\n"
"\n"
"global index_type* out_rowSizes = sparse_rowSizes(sparse);\n"
"global index_type* out_rowIndices = sparse_rowIndices(sparse, seq1Length + 1);\n"
"global struct cell_type* out_data =  sparse_data(sparse, seq1Length + 1);\n"
"\n"
"int numCells = 0;\n"
"int maxRowSize = 0;\n"
"int j = j_offset;\n"
"int lanesCount = _ceildiv(width, get_local_size(0));\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"int index = jaggedIndex(j, -j_offset + 1, seq1Length + 1, jagSize_float);			// seek to -j'th row\n"
"global const float *p = posterior + index;\n"
"\n"
"// calculate number of non-zero elements in each row\n"
"// vertical buffer stores size of rows\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"--iterCount; // omit 0'th column\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter)\n"
"{\n"
"int i = iter - j_offset + 1;\n"
"if (i > 0 && i <= seq1Length)\n"
"{\n"
"if (j == 0)	{ verticalBuffer[i] = 0; }\n"
"else if (j <= seq2Length && *p >= POSTERIOR_CUTOFF) { ++verticalBuffer[i];  }\n"
"}\n"
"\n"
"p += jagSize_float;\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"}\n"
"\n"
"// copy verticalBuffer to rowSizes\n"
"for (int q = j_offset; q <= seq1Length; q += get_local_size(0)) {\n"
"out_rowSizes[q] = verticalBuffer[q];\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"// calculate starting indices in sparse matrix\n"
"if (j_offset == 0) {\n"
"verticalBuffer[0] = 0;\n"
"\n"
"for (int i = 0; i <= seq1Length; ++i) {\n"
"int size = verticalBuffer[i];\n"
"maxRowSize = max(maxRowSize, size);\n"
"verticalBuffer[i] = numCells;\n"
"numCells += size;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// copy verticalBuffer to rowIndices\n"
"for (int q = j_offset; q <= seq1Length; q += get_local_size(0)) {\n"
"out_rowIndices[q] = verticalBuffer[q];\n"
"}\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"j = j_offset;\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"int index = jaggedIndex(j, -j_offset + 1, seq1Length + 1, jagSize_float);			// seek to -j'th row\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"--iterCount; // omit 0'th column\n"
"\n"
"// fill in data matrix\n"
"global const float* p = posterior + index;\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = iter - j_offset + 1;\n"
"if (i > 0 && i <= seq1Length && j > 0 && j <= seq2Length) {\n"
"float v = *p;\n"
"if (v >= POSTERIOR_CUTOFF) {\n"
"out_data[verticalBuffer[i]].column = j;\n"
"out_data[verticalBuffer[i]].value = v;\n"
"++verticalBuffer[i];\n"
"}\n"
"}\n"
"p += jagSize_float;\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"}\n"
"}\n"
"\n"
"if (j_offset == 0) {\n"
"out_rowIndices[0] = numCells; // store number of non-zero cells here\n"
"out_rowSizes[0] = maxRowSize;	// store maximum row size\n"
"\n"
"sparse[0] = posterior[0];	// store distances\n"
"sparse[1] = numCells;		// store number of non-zero elements\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"__kernel void SparseMatrix_Generate_verylong(\n"
"int seq1Length,\n"
"int seq2Length,\n"
"global	const	float* posterior,\n"
"local			int* verticalBuffer,\n"
"global			float* sparse)\n"
"{\n"
"#define width (seq2Length + 1)\n"
"#define height (seq1Length + 1)\n"
"int j_offset = get_local_id(0);\n"
"\n"
"global index_type* out_rowSizes = sparse_rowSizes(sparse);\n"
"global index_type* out_rowIndices = sparse_rowIndices(sparse, seq1Length + 1);\n"
"global struct cell_type* out_data =  sparse_data(sparse, seq1Length + 1);\n"
"\n"
"int numCells = 0;\n"
"int maxRowSize = 0;\n"
"int j = j_offset;\n"
"int lanesCount = _ceildiv(width, get_local_size(0));\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"int index = jaggedIndex(j, -j_offset + 1, seq1Length + 1, jagSize_float);			// seek to -j'th row\n"
"global const float *p = posterior + index;\n"
"\n"
"// calculate number of non-zero elements in each row\n"
"// vertical buffer stores size of rows\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"--iterCount; // omit 0'th column\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter)\n"
"{\n"
"int i = iter - j_offset + 1;\n"
"if (i > 0 && i <= seq1Length)\n"
"{\n"
"if (j == 0)	{ out_rowSizes[i] = 0; }\n"
"else if (j <= seq2Length && *p >= POSTERIOR_CUTOFF) { ++out_rowSizes[i];  }\n"
"}\n"
"\n"
"p += jagSize_float;\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"}\n"
"\n"
"// calculate starting indices in sparse matrix\n"
"if (j_offset == 0) {\n"
"out_rowIndices[0] = 0;\n"
"out_rowSizes[0] = 0;\n"
"\n"
"for (int i = 0; i <= seq1Length; ++i) {\n"
"int size = out_rowSizes[i];\n"
"maxRowSize = max(maxRowSize, size);\n"
"out_rowIndices[i] = numCells;\n"
"numCells += size;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"j = j_offset;\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"int index = jaggedIndex(j, -j_offset + 1, seq1Length + 1, jagSize_float);			// seek to -j'th row\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"--iterCount; // omit 0'th column\n"
"\n"
"// fill in data matrix\n"
"global const float* p = posterior + index;\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = iter - j_offset + 1;\n"
"if (i > 0 && i <= seq1Length && j > 0 && j <= seq2Length) {\n"
"float v = *p;\n"
"if (v >= POSTERIOR_CUTOFF) {\n"
"out_data[out_rowIndices[i]].column = j;\n"
"out_data[out_rowIndices[i]].value = v;\n"
"++out_rowIndices[i];\n"
"}\n"
"}\n"
"p += jagSize_float;\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"}\n"
"}\n"
"\n"
"// restore offsets\n"
"for (int i = j_offset; i <= seq1Length; i += get_local_size(0)) {\n"
"out_rowIndices[i] -= out_rowSizes[i];\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"if (j_offset == 0) {\n"
"out_rowIndices[0] = numCells; // store number of non-zero cells here\n"
"out_rowSizes[0] = maxRowSize;	// store maximum row size\n"
"\n"
"sparse[0] = posterior[0];	// store distances\n"
"sparse[1] = numCells;		// store number of non-zero elements\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"__kernel void SparseMatrix_GenerateTranspose_long(\n"
"int seq1Length,\n"
"int seq2Length,\n"
"global	const	float* posterior,\n"
"local			int* horizontalBuffer,\n"
"global			float* sparse)\n"
"{\n"
"int j_offset = get_local_id(0);\n"
"\n"
"global index_type* out_rowSizes = sparse_rowSizes(sparse);\n"
"global index_type* out_rowIndices = sparse_rowIndices(sparse, seq2Length + 1);\n"
"global struct cell_type* out_data =  sparse_data(sparse, seq2Length + 1);\n"
"\n"
"int numCells = 0;\n"
"int maxRowSize = 0;\n"
"\n"
"int j = j_offset;\n"
"int lanesCount = _ceildiv(seq2Length + 1, get_local_size(0));\n"
"\n"
"for (int lane = 0; lane < lanesCount; ++lane, j += get_local_size(0)) {\n"
"\n"
"int rowSize = 0;\n"
"int index = jaggedIndex(j, -j_offset + 1, seq1Length + 1, jagSize_float);		// omit 0'th row\n"
"global const float *p = posterior + index;\n"
"\n"
"// calculate number of non-zero elements in each column\n"
"int iterCount = seq1Length + min(get_local_size(0), (uint)(seq2Length + 1) - lane * get_local_size(0));\n"
"--iterCount; // omit 0'th column\n"
"\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = iter - j_offset + 1;\n"
"if (i > 0 && i <= seq1Length) {\n"
"if (j > 0 && j <= seq2Length && *p >= POSTERIOR_CUTOFF) { ++rowSize; }\n"
"}\n"
"\n"
"p += jagSize_float;\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"// copy register value to rowSizes and horizontalBuffer\n"
"if (j <= seq2Length) { out_rowSizes[j] = horizontalBuffer[j_offset] = rowSize; }\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// calculate starting indices in sparse matrix\n"
"if (j_offset == 0) {\n"
"int end = min((int)get_local_size(0), seq2Length + 1 - lane * (int)get_local_size(0));\n"
"for (int x = 0; x < end; ++x) {\n"
"int size = horizontalBuffer[x];\n"
"maxRowSize = max(maxRowSize, size);\n"
"horizontalBuffer[x] = numCells;\n"
"numCells += size;\n"
"}\n"
"}\n"
"\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"// copy horizontal to rowIndices\n"
"if (j <= seq2Length) { out_rowIndices[j] = horizontalBuffer[j_offset]; }\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"// fill in data matrix\n"
"p = posterior + index;\n"
"for (int iter = 0; iter < iterCount; ++iter) {\n"
"int i = iter - j_offset + 1;\n"
"if (i > 0 && i <= seq1Length && j > 0 && j <= seq2Length) {\n"
"float v = *p;\n"
"if (v >= POSTERIOR_CUTOFF) {\n"
"out_data[horizontalBuffer[j_offset]].column = i;\n"
"out_data[horizontalBuffer[j_offset]].value = v;\n"
"++horizontalBuffer[j_offset];\n"
"}\n"
"}\n"
"p += jagSize_float;\n"
"barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);\n"
"}\n"
"}\n"
"\n"
"if (j_offset == 0) {\n"
"out_rowIndices[0] = numCells; // store number of non-zero cells here\n"
"out_rowSizes[0] = maxRowSize;	// store maximum row size\n"
"\n"
"sparse[0] = posterior[0];	// store distances\n"
"sparse[1] = numCells;		// store number of non-zero elements\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"/*\n"
"__kernel void SparseMatrix_ToDense(\n"
"int seq1Length,\n"
"int seq2Length,\n"
"global	const	struct cell_type* data,\n"
"global	const	int* rowSizes,\n"
"global			float* posterior)\n"
"{\n"
"int j = get_local_id(0);\n"
"int index = jaggedIndex(j, -j, seq1Length + 1, jagSize_float);			// seek to -j'th row\n"
"\n"
"int iterationsCount  = seq1Length + seq2Length + 1;\n"
"for (int iter = 0; iter < iterationsCount; ++iter) {\n"
"int i = iter - j;\n"
"\n"
"if (i >= 0 && i <= seq1Length && j <= seq2Length) {\n"
"posterior[index] = 0;\n"
"}\n"
"\n"
"index += jagSize_float;\n"
"}\n"
"\n"
"barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"global struct cell_type* rowPtr = data;\n"
"for (int i = 1; i <= seq1Length; i++) {\n"
"if (j < rowSizes[i]) {\n"
"int index = jaggedIndex(rowPtr[j].column, i, seq1Length + 1, jagSize_float);\n"
"posterior[index] = rowPtr[j].value;\n"
"}\n"
"\n"
"rowPtr += rowSizes[i];\n"
"}\n"
"}\n"
"*/\n"
,
"#pragma once\n"
"\n"
"#define _sqr(a) (a)*(a)\n"
"\n"
"#define _max(a,b)		((a) >= (b) ? (a) : (b))\n"
"#define _min(a,b)		((a) <= (b) ? (a) : (b))\n"
"#define _min3(a,b,c)	_min(_min(a,b),c)\n"
"#define _max3(a,b,c)	_max(_max(a,b),c)\n"
"#define _min4(a,b,c,d)	_min(_min(a,b),_min(c,d))\n"
"#define _max4(a,b,c,d)	_max(_max(a,b),_max(c,d))\n"
"\n"
"// divides a by b and gets ceil of result\n"
"#define _ceildiv(a,b) (((a) + (b) - 1) / (b))\n"
"\n"
"// perform ceil rounding to the closest multiplicity of b\n"
"#define _ceilround(a,b) (((a) + (b) - 1) / (b) * (b))\n"
,
""
};